diff options
Diffstat (limited to 'admin/stats.py')
| -rw-r--r-- | admin/stats.py | 106 |
1 files changed, 106 insertions, 0 deletions
"""Visit statistics computed from a web server access log.

Run as a script with the path of the access log as first argument.  It
prints the number of distinct visitors, the number of visits, and the
hit count of every "interesting" page, most requested first.

Only log lines matching ACCESS_RE (successful GET requests) whose
resource satisfies interesting() are taken into account.
"""

from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from sys import argv

import user_agents


# One access-log line, as space-separated fields.  The layout looks like
# the Apache/nginx "combined" format -- TODO confirm against the server
# configuration.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',                # client address
    r'\S+',                             # ignored field
    r'\S+',                             # ignored field
    r'(?P<date>\[\S+ \S+\])',           # bracketed timestamp, see DATE_FMT
    # GET requests only; the query string is kept out of "resource".
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',                      # status 200 only, then a number
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime() format of the "date" group, brackets included.  %z makes the
# parsed datetimes timezone-aware.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Longest gap allowed between two consecutive accesses of the same
# visitor for them to belong to the same visit (see visits()).
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """A single request extracted from the access log."""

    address: str
    # Parsed user agent as returned by user_agents.parse(), not the raw
    # header string.
    useragent: 'user_agents.parsers.UserAgent'
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a successful ACCESS_RE match.

        *info* only needs to support lookup of the 'address',
        'useragent', 'referer', 'date' and 'resource' keys, which a
        match object of ACCESS_RE does.

        Raises ValueError if the 'date' field does not follow DATE_FMT.
        """
        return cls(
            info['address'], user_agents.parse(info['useragent']),
            info['referer'], datetime.strptime(info['date'], DATE_FMT),
            info['resource']
        )


def interesting(resource: str) -> bool:
    """Tell whether *resource* is a page worth counting.

    Pages are the site root ('/') and anything ending in '.html'.
    """
    return (
        resource.endswith('.html')
        or resource == '/'
    )


def parse(logs_path):
    """Return the interesting accesses found in the log at *logs_path*.

    The result is a tuple of Access objects, in file order.  Lines that
    do not match ACCESS_RE, or whose resource is not interesting(), are
    silently skipped.
    """
    # Stream the file instead of loading it whole, and never let a stray
    # undecodable byte in a request line abort the whole analysis.
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        matches = (ACCESS_RE.match(line) for line in logs_file)
        # The generator reads from the file lazily: it must be consumed
        # before leaving the "with" block.
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )


def key(access: Access) -> str:
    """Return the string identifying the visitor behind *access*.

    A visitor is identified by its address and the string form of its
    user agent.
    """
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    """Group *accesses* into visits, per visitor.

    Returns a defaultdict mapping each visitor key (see key()) to its
    list of visits, a visit being a non-empty list of accesses.

    An access joins the visitor's latest visit when it happens less than
    VISIT_MAX_DURATION after the last access of that visit; otherwise it
    starts a new visit.  Accesses are compared with the previously seen
    one only, so they are expected in chronological order.
    """
    grouped = defaultdict(list)

    for access in accesses:
        visitor_visits = grouped[key(access)]

        if visitor_visits:
            last_access = visitor_visits[-1][-1].time

            if access.time - last_access < VISIT_MAX_DURATION:
                visitor_visits[-1].append(access)
                continue

        visitor_visits.append([access])

    return grouped


def order(grouped_visits):
    """Flatten { visitor: [visits] } to { (visitor, t0): accesses }.

    *t0* is the time of the first access of the visit.
    """
    return {
        (visitor, visit[0].time): visit
        for visitor, visitor_visits in grouped_visits.items()
        for visit in visitor_visits
    }


def analyze(logs_path):
    """Print visitor, visit and page-hit statistics for *logs_path*."""
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)

    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))

    pagehits = Counter(a.resource for a in accesses)
    for page, hits in pagehits.most_common():
        print(hits, page)


if __name__ == '__main__':
    if len(argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')

    analyze(argv[1])
