diff options
Diffstat (limited to 'admin/stats/util.py')
| -rw-r--r-- | admin/stats/util.py | 67 |
1 file changed, 43 insertions, 24 deletions
diff --git a/admin/stats/util.py b/admin/stats/util.py index ced69d9..e79784c 100644 --- a/admin/stats/util.py +++ b/admin/stats/util.py @@ -1,6 +1,6 @@ +from collections import Counter, defaultdict from dataclasses import dataclass -from datetime import datetime -from itertools import groupby +from datetime import datetime, timedelta import re from sys import argv @@ -12,7 +12,7 @@ ACCESS_RE = re.compile(' '.join(( r'\S+', r'\S+', r'(?P<date>\[\S+ \S+\])', - r'"GET (?P<resource>\S+) [^"]+"', + r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"', r'200 [0-9]+', r'"(?P<referer>[^"]+)"', r'"(?P<useragent>[^"]+)"' @@ -20,6 +20,8 @@ ACCESS_RE = re.compile(' '.join(( DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]' +VISIT_MAX_DURATION = timedelta(hours=1) + @dataclass class Access: @@ -41,7 +43,6 @@ def interesting(resource): return ( resource.endswith('.html') or resource == '/' - or resource.startswith('/?') ) def parse(logs_path): @@ -57,31 +58,49 @@ def parse(logs_path): def key(access): return f'{access.address} / {access.useragent}' -def group(accesses): - groups = {} - for k, g in groupby(sorted(accesses, key=key), key=key): - groups[k] = tuple( - (a.time, a.resource, a.referer) - for a in g - ) - return groups +def visits(accesses): + # Map (IP, user agent) to list of visits. A visit is a list of + # accesses. When processing an access, if the previous time for + # this (IP, user agent) is less than VISIT_MAX_DURATION seconds + # ago, we aggregate it, otherwise, we start a new visit. + visits = defaultdict(list) + + for access in accesses: + visitor = key(access) + + if visitor in visits: + last_access = visits[visitor][-1][-1].time + + if access.time - last_access < VISIT_MAX_DURATION: + visits[visitor][-1].append(access) + continue + + visits[visitor].append([access]) + + return visits + +def order(grouped_visits): + # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }. 
+ visits = {} -def span(visits): - return (visits[0][0], - visits[-1][0]-visits[0][0]) + for i, i_visits in grouped_visits.items(): + for v in i_visits: + visits[(i, v[0].time)] = v -def dump(logs_path): - for k, accesses in group(parse(logs_path)).items(): - print(k) + return visits - t, duration = span(accesses) - print(t, duration) +def analyze(logs_path): + accesses = parse(logs_path) + visits_by_visitor = visits(accesses) + visits_by_time = order(visits_by_visitor) - for (_, rsrc, ref) in accesses: - print(f'{rsrc} {ref}') + print('Visiteurs :', len(visits_by_visitor)) + print('Visites :', len(visits_by_time)) - print() + pagehits = Counter(a.resource for a in accesses) + for page, hits in pagehits.most_common(): + print(hits, page) if __name__ == '__main__': - dump(argv[1]) + analyze(argv[1]) |
