summaryrefslogtreecommitdiff
path: root/admin/stats
diff options
context:
space:
mode:
Diffstat (limited to 'admin/stats')
-rw-r--r--admin/stats/util.py106
1 files changed, 0 insertions, 106 deletions
diff --git a/admin/stats/util.py b/admin/stats/util.py
deleted file mode 100644
index e79784c..0000000
--- a/admin/stats/util.py
+++ /dev/null
@@ -1,106 +0,0 @@
-from collections import Counter, defaultdict
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-import re
-from sys import argv
-
-import user_agents
-
-
-ACCESS_RE = re.compile(' '.join((
- r'(?P<address>\S+)',
- r'\S+',
- r'\S+',
- r'(?P<date>\[\S+ \S+\])',
- r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
- r'200 [0-9]+',
- r'"(?P<referer>[^"]+)"',
- r'"(?P<useragent>[^"]+)"'
-)))
-
-DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
-
-VISIT_MAX_DURATION = timedelta(hours=1)
-
-
-@dataclass
-class Access:
- address: str
- useragent: str
- referer: str
- time: datetime
- resource: str
-
- @classmethod
- def from_log(cls, info):
- return cls(
- info['address'], user_agents.parse(info['useragent']),
- info['referer'], datetime.strptime(info['date'], DATE_FMT),
- info['resource']
- )
-
-def interesting(resource):
- return (
- resource.endswith('.html')
- or resource == '/'
- )
-
-def parse(logs_path):
- with open(logs_path) as logs_file:
- logs = logs_file.read().splitlines()
-
- matches = (ACCESS_RE.match(l) for l in logs)
- return tuple(
- Access.from_log(m) for m in matches
- if m is not None and interesting(m['resource'])
- )
-
-def key(access):
- return f'{access.address} / {access.useragent}'
-
-def visits(accesses):
- # Map (IP, user agent) to list of visits. A visit is a list of
- # accesses. When processing an access, if the previous time for
- # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
- # ago, we aggregate it, otherwise, we start a new visit.
- visits = defaultdict(list)
-
- for access in accesses:
- visitor = key(access)
-
- if visitor in visits:
- last_access = visits[visitor][-1][-1].time
-
- if access.time - last_access < VISIT_MAX_DURATION:
- visits[visitor][-1].append(access)
- continue
-
- visits[visitor].append([access])
-
- return visits
-
-def order(grouped_visits):
- # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
- visits = {}
-
- for i, i_visits in grouped_visits.items():
- for v in i_visits:
- visits[(i, v[0].time)] = v
-
- return visits
-
-def analyze(logs_path):
- accesses = parse(logs_path)
- visits_by_visitor = visits(accesses)
- visits_by_time = order(visits_by_visitor)
-
- print('Visiteurs :', len(visits_by_visitor))
- print('Visites :', len(visits_by_time))
-
- pagehits = Counter(a.resource for a in accesses)
- for page, hits in pagehits.most_common():
- print(hits, page)
-
-
-if __name__ == '__main__':
- analyze(argv[1])