summaryrefslogtreecommitdiff
path: root/admin/stats.py
diff options
context:
space:
mode:
Diffstat (limited to 'admin/stats.py')
-rw-r--r--admin/stats.py106
1 files changed, 106 insertions, 0 deletions
diff --git a/admin/stats.py b/admin/stats.py
new file mode 100644
index 0000000..e79784c
--- /dev/null
+++ b/admin/stats.py
@@ -0,0 +1,106 @@
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import re
+from sys import argv
+
+import user_agents
+
+
+# Regular expression for one access-log line, made of space-separated
+# fields: client address, two ignored fields, bracketed date (two
+# tokens), quoted request line, status and size, quoted referer and
+# quoted user agent (presumably the "combined" log format -- confirm
+# against the server configuration).
+# Only GET requests answered with status 200 can match; the query
+# string, if any, is left out of the `resource` group.
+ACCESS_RE = re.compile(' '.join((
+ r'(?P<address>\S+)',
+ r'\S+',
+ r'\S+',
+ r'(?P<date>\[\S+ \S+\])',
+ r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+ r'200 [0-9]+',
+ r'"(?P<referer>[^"]+)"',
+ r'"(?P<useragent>[^"]+)"'
+)))
+
+# strptime() format of the `date` group, brackets included.
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+# Two consecutive accesses by the same visitor belong to the same visit
+# when they are less than this far apart (see visits()).
+VISIT_MAX_DURATION = timedelta(hours=1)
+
+
+@dataclass
+class Access:
+    """One request extracted from the access log.
+
+    Attributes:
+        address: content of the `address` group (first log field).
+        useragent: user agent; when built through from_log() this is
+            the object returned by user_agents.parse(), not the raw
+            header string.
+        referer: content of the quoted referer field.
+        time: time of the request, parsed with DATE_FMT by from_log().
+        resource: content of the `resource` group (requested path).
+    """
+    address: str
+    # String annotation: documents the real type without requiring the
+    # name to be resolvable when the class is created.
+    useragent: 'user_agents.parsers.UserAgent'
+    referer: str
+    time: datetime
+    resource: str
+
+    @classmethod
+    def from_log(cls, info):
+        """Build an Access from a match of ACCESS_RE.
+
+        `info` only needs to support lookup by group name ('address',
+        'useragent', 'referer', 'date', 'resource').
+        """
+        return cls(
+            address=info['address'],
+            useragent=user_agents.parse(info['useragent']),
+            referer=info['referer'],
+            time=datetime.strptime(info['date'], DATE_FMT),
+            resource=info['resource'],
+        )
+
+def interesting(resource):
+    """Tell whether `resource` counts as a page view.
+
+    Only the site root ('/') and paths ending in '.html' qualify.
+    """
+    if resource == '/':
+        return True
+    return resource.endswith('.html')
+
+def parse(logs_path):
+    """Read the log file at `logs_path` and return its page accesses.
+
+    Lines that ACCESS_RE does not match, and matches whose resource is
+    not interesting(), are dropped. The result is a tuple of Access
+    objects in file order.
+    """
+    with open(logs_path) as logs_file:
+        lines = logs_file.read().splitlines()
+
+    accesses = []
+    for line in lines:
+        match = ACCESS_RE.match(line)
+        if match is None:
+            continue
+        if not interesting(match['resource']):
+            continue
+        accesses.append(Access.from_log(match))
+
+    return tuple(accesses)
+
+def key(access):
+    """Return the string identifying the visitor behind `access`.
+
+    The identifier combines the client address and the user agent.
+    """
+    return '{} / {}'.format(access.address, access.useragent)
+
+def visits(accesses):
+    """Group `accesses` into visits, per visitor.
+
+    Returns a defaultdict mapping each visitor (see key()) to a list of
+    visits, a visit being a list of accesses. Accesses are processed in
+    the order given (presumably chronological, as in the log). An
+    access joins the visitor's latest visit when it happens less than
+    VISIT_MAX_DURATION after the last access of that visit; otherwise
+    it starts a new visit.
+    """
+    by_visitor = defaultdict(list)
+
+    for access in accesses:
+        # Looking the visitor up creates an empty history on first
+        # sight; it receives its first visit right below.
+        history = by_visitor[key(access)]
+
+        if history and access.time - history[-1][-1].time < VISIT_MAX_DURATION:
+            history[-1].append(access)
+        else:
+            history.append([access])
+
+    return by_visitor
+
+def order(grouped_visits):
+    """Flatten { visitor: [visits] } to { (visitor, t0): accesses }.
+
+    `t0` is the time of the first access of the visit. Entries keep the
+    iteration order of `grouped_visits`; no sorting is performed.
+    """
+    return {
+        (visitor, visit[0].time): visit
+        for visitor, visitor_visits in grouped_visits.items()
+        for visit in visitor_visits
+    }
+
+def analyze(logs_path):
+    """Print statistics about the access log at `logs_path`.
+
+    Prints the number of visitors, the number of visits, then one line
+    per page with its hit count, most requested pages first.
+    """
+    accesses = parse(logs_path)
+    by_visitor = visits(accesses)
+    by_start = order(by_visitor)
+
+    print('Visiteurs :', len(by_visitor))
+    print('Visites :', len(by_start))
+
+    hits_per_page = Counter()
+    for access in accesses:
+        hits_per_page[access.resource] += 1
+
+    for page, hits in hits_per_page.most_common():
+        print(hits, page)
+
+
+if __name__ == '__main__':
+    # Exit with a usage message instead of an IndexError traceback when
+    # the path of the log file is missing from the command line.
+    if len(argv) < 2:
+        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
+    analyze(argv[1])