From f05afdd1262eff637e80c30d8af6e27c6d6cf072 Mon Sep 17 00:00:00 2001
From: Kévin Le Gouguec
Date: Sat, 20 Mar 2021 19:29:54 +0100
Subject: Start dumping some stats

---
 admin/stats/util.py | 67 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 24 deletions(-)

(limited to 'admin/stats/util.py')

diff --git a/admin/stats/util.py b/admin/stats/util.py
index ced69d9..e79784c 100644
--- a/admin/stats/util.py
+++ b/admin/stats/util.py
@@ -1,6 +1,6 @@
+from collections import Counter, defaultdict
 from dataclasses import dataclass
-from datetime import datetime
-from itertools import groupby
+from datetime import datetime, timedelta
 import re
 from sys import argv
 
@@ -12,7 +12,7 @@ ACCESS_RE = re.compile(' '.join((
     r'\S+',
     r'\S+',
     r'(?P<time>\[\S+ \S+\])',
-    r'"GET (?P<resource>\S+) [^"]+"',
+    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
     r'200 [0-9]+',
     r'"(?P<referer>[^"]+)"',
     r'"(?P<useragent>[^"]+)"'
@@ -20,6 +20,8 @@ ACCESS_RE = re.compile(' '.join((
 
 DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
 
+VISIT_MAX_DURATION = timedelta(hours=1)
+
 
 @dataclass
 class Access:
@@ -41,7 +43,6 @@ def interesting(resource):
     return (
         resource.endswith('.html')
         or resource == '/'
-        or resource.startswith('/?')
     )
 
 def parse(logs_path):
@@ -57,31 +58,49 @@ def parse(logs_path):
 def key(access):
     return f'{access.address} / {access.useragent}'
 
-def group(accesses):
-    groups = {}
-    for k, g in groupby(sorted(accesses, key=key), key=key):
-        groups[k] = tuple(
-            (a.time, a.resource, a.referer)
-            for a in g
-        )
-    return groups
+def visits(accesses):
+    # Map (IP, user agent) to list of visits. A visit is a list of
+    # accesses. When processing an access, if the previous time for
+    # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
+    # ago, we aggregate it, otherwise, we start a new visit.
+    visits = defaultdict(list)
+
+    for access in accesses:
+        visitor = key(access)
+
+        if visitor in visits:
+            last_access = visits[visitor][-1][-1].time
+
+            if access.time - last_access < VISIT_MAX_DURATION:
+                visits[visitor][-1].append(access)
+                continue
+
+        visits[visitor].append([access])
+
+    return visits
+
+def order(grouped_visits):
+    # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
+    visits = {}
 
-def span(visits):
-    return (visits[0][0],
-            visits[-1][0]-visits[0][0])
+    for i, i_visits in grouped_visits.items():
+        for v in i_visits:
+            visits[(i, v[0].time)] = v
 
-def dump(logs_path):
-    for k, accesses in group(parse(logs_path)).items():
-        print(k)
+    return visits
 
-        t, duration = span(accesses)
-        print(t, duration)
+def analyze(logs_path):
+    accesses = parse(logs_path)
+    visits_by_visitor = visits(accesses)
+    visits_by_time = order(visits_by_visitor)
 
-        for (_, rsrc, ref) in accesses:
-            print(f'{rsrc} {ref}')
+    print('Visiteurs :', len(visits_by_visitor))
+    print('Visites :', len(visits_by_time))
 
-        print()
+    pagehits = Counter(a.resource for a in accesses)
+    for page, hits in pagehits.most_common():
+        print(hits, page)
 
 
 if __name__ == '__main__':
-    dump(argv[1])
+    analyze(argv[1])
-- 
cgit v1.2.3