# Contents of admin/stats/util.py as created by commit
# 0ae67d029c4bc191f14a2c34ccad2a4c670c23ee ("Start analyzing visits",
# Kévin Le Gouguec, Sat, 20 Mar 2021 18:06:41 +0100).
"""Group the page views found in a web-server access log by visitor.

Run as a script with the path of the access log as the only argument::

    python util.py LOGFILE
"""

from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
import re
from sys import argv


# One access-log line; fields are separated by single spaces.  Only GET
# requests answered with status 200 match, anything else yields None.
# NOTE(review): the group names are the keys Access.from_log() reads; the
# referer-before-user-agent order assumes the usual "combined" log format --
# confirm against the server configuration.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>\S+) [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime format of the bracketed timestamp captured by the "date" group.
# %b matches month names according to the current locale.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'


@dataclass
class Access:
    """A single request extracted from the access log."""

    address: str
    # Whatever user_agents.parse() returns, not the raw header string; key()
    # only relies on it being convertible to text.
    useragent: object
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a successful ACCESS_RE match.

        *info* must support lookup by the group names "address",
        "useragent", "referer", "date" and "resource".
        """
        # Imported here rather than at module level so that the helpers
        # which only need the standard library can be imported without the
        # third-party user_agents package being installed.
        import user_agents

        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )


def interesting(resource):
    """Return True if *resource* is an HTML page or the site root.

    The root counts with or without a query string ("/" or "/?...").
    """
    return (
        resource.endswith('.html')
        or resource == '/'
        or resource.startswith('/?')
    )


def parse(logs_path):
    """Return a tuple of Access objects for the interesting lines of a log.

    Lines that do not match ACCESS_RE, or whose resource is not
    interesting(), are skipped.
    """
    # errors='replace' keeps a stray undecodable byte in one request from
    # aborting the whole analysis.
    with open(logs_path, errors='replace') as logs_file:
        # Stream the file line by line instead of loading it whole; the
        # tuple is built before the file is closed.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )


def key(access):
    """Return the string identifying a visitor: "<address> / <useragent>"."""
    return f'{access.address} / {access.useragent}'


def group(accesses):
    """Group accesses by visitor.

    Returns a dict mapping each key() value to a tuple of
    (time, resource, referer) triples.  Keys are inserted in sorted order;
    the sort is stable, so the triples of one visitor keep the order they
    had in *accesses*.
    """
    groups = {}
    # groupby() only merges adjacent items, hence the sort on the same key.
    for visitor, visits in groupby(sorted(accesses, key=key), key=key):
        groups[visitor] = tuple(
            (a.time, a.resource, a.referer)
            for a in visits
        )
    return groups


def span(visits):
    """Return (time of first visit, time elapsed between first and last).

    *visits* is a non-empty sequence of tuples whose first item is a
    datetime, as produced by group(); an empty sequence raises IndexError.
    """
    first = visits[0][0]
    last = visits[-1][0]
    return (first, last - first)


def dump(logs_path):
    """Print, for every visitor in the log, its span and visited pages."""
    for visitor, accesses in group(parse(logs_path)).items():
        print(visitor)

        start, duration = span(accesses)
        print(start, duration)

        for (_, rsrc, ref) in accesses:
            print(f'{rsrc} {ref}')

        print()


if __name__ == '__main__':
    # Fail with a usage message instead of a bare IndexError traceback.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} LOGFILE')
    dump(argv[1])