summaryrefslogtreecommitdiff
path: root/admin/stats/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'admin/stats/util.py')
-rw-r--r--admin/stats/util.py87
1 files changed, 87 insertions, 0 deletions
diff --git a/admin/stats/util.py b/admin/stats/util.py
new file mode 100644
index 0000000..ced69d9
--- /dev/null
+++ b/admin/stats/util.py
@@ -0,0 +1,87 @@
+from dataclasses import dataclass
+from datetime import datetime
+from itertools import groupby
+import re
+from sys import argv
+
+import user_agents
+
+
# Space-separated fields of one access-log line, in the order they appear.
# Only GET requests answered with status 200 can match.
_ACCESS_FIELDS = (
    r'(?P<address>\S+)',             # client address
    r'\S+',                          # ignored (presumably the identd field)
    r'\S+',                          # ignored (presumably the remote user)
    r'(?P<date>\[\S+ \S+\])',        # bracketed timestamp, see DATE_FMT
    r'"GET (?P<resource>\S+) [^"]+"',  # request line; the resource is kept
    r'200 [0-9]+',                   # status 200 followed by a numeric field
    r'"(?P<referer>[^"]+)"',         # quoted referer
    r'"(?P<useragent>[^"]+)"',       # quoted user-agent string
)

ACCESS_RE = re.compile(' '.join(_ACCESS_FIELDS))

# strptime format of the text captured by the `date` group of ACCESS_RE.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+
@dataclass
class Access:
    """One request extracted from an access-log line.

    Attributes:
        address: Client address as captured from the log line.
        useragent: Value returned by ``user_agents.parse`` for the raw
            user-agent string (a parsed object, not the raw text).
        referer: Referer as captured from the log line.
        time: Timestamp of the log line, parsed with ``DATE_FMT``.
        resource: Requested resource as captured from the log line.
    """
    address: str
    # String annotation: from_log stores the parsed object, not a str.
    useragent: 'user_agents.parsers.UserAgent'
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from the named groups of a log-line match.

        Args:
            info: Mapping-like object (e.g. a match of ``ACCESS_RE``)
                indexable by 'address', 'useragent', 'referer', 'date'
                and 'resource'.

        Returns:
            A new Access with the user agent parsed and the date
            converted to a datetime.

        Raises:
            ValueError: If the 'date' text does not follow ``DATE_FMT``.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )
+
def interesting(resource):
    """Tell whether a requested resource counts as a page view.

    A resource is a page when its path, ignoring any query string, is
    the site root ('/') or ends with '.html'.

    Args:
        resource: Request target from the log line, e.g. '/a.html?x=1'.

    Returns:
        True for page requests, False for anything else.
    """
    # Judge the path alone: a query string must neither hide a page
    # ('/a.html?x=1') nor make a non-page look like one ('/s?q=a.html').
    path = resource.partition('?')[0]
    return path == '/' or path.endswith('.html')
+
def parse(logs_path):
    """Read an access log and return its page requests.

    Lines that do not match ``ACCESS_RE``, or whose resource is not
    ``interesting``, are skipped.

    Args:
        logs_path: Path of the access-log file to read.

    Returns:
        A tuple of Access objects, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Log lines embed client-supplied text (referer, user agent), so
    # undecodable bytes are replaced instead of aborting the whole run.
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        # Iterate the file lazily; the tuple is built before it closes.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        return tuple(
            Access.from_log(match) for match in matches
            if match is not None and interesting(match['resource'])
        )
+
def key(access):
    """Return the string identifying the visitor behind an access.

    The identifier is the access' address and user agent, formatted and
    joined by ' / '.
    """
    return '{} / {}'.format(access.address, access.useragent)
+
def group(accesses):
    """Gather accesses per visitor.

    Args:
        accesses: Iterable of objects exposing `time`, `resource` and
            `referer`, and accepted by `key`.

    Returns:
        A dict mapping each visitor key to a tuple of
        (time, resource, referer) triples. Keys are inserted in sorted
        order; triples keep the order of the input.
    """
    buckets = {}
    for access in accesses:
        visit = (access.time, access.resource, access.referer)
        buckets.setdefault(key(access), []).append(visit)
    return {visitor: tuple(buckets[visitor]) for visitor in sorted(buckets)}
+
def span(visits):
    """Return when a series of visits started and how long it lasted.

    Args:
        visits: Non-empty sequence of tuples whose first item is the
            visit's datetime.

    Returns:
        A (start, duration) pair: the earliest visit time and the
        timedelta between the earliest and the latest one.

    Raises:
        IndexError: If `visits` is empty.
    """
    if not visits:
        raise IndexError('span() needs at least one visit')
    # Do not rely on the input order: log lines may not be written in
    # timestamp order, which would yield a negative duration.
    times = [visit[0] for visit in visits]
    start = min(times)
    return (start, max(times) - start)
+
def dump(logs_path):
    """Print a per-visitor report of the page requests in an access log.

    For every visitor the report shows the visitor key, the start time
    and duration of its visits, one "resource referer" line per visit,
    and a trailing blank line.

    Args:
        logs_path: Path of the access-log file to report on.
    """
    visitors = group(parse(logs_path))
    for visitor, visits in visitors.items():
        print(visitor)

        first_seen, duration = span(visits)
        print(first_seen, duration)

        for _when, resource, referer in visits:
            print(f'{resource} {referer}')

        print()
+
+
if __name__ == '__main__':
    # Exit with a usage message (printed to stderr, non-zero status)
    # instead of an IndexError traceback when the log path is missing.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    dump(argv[1])