Diffstat (limited to 'admin/stats.py')
| -rwxr-xr-x | admin/stats.py | 153 |
1 files changed, 0 insertions, 153 deletions
```diff
diff --git a/admin/stats.py b/admin/stats.py
deleted file mode 100755
index e3d46cc..0000000
--- a/admin/stats.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python3
-
-from collections import Counter, defaultdict
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-import re
-from os import remove
-from subprocess import run
-from sys import argv
-
-import user_agents
-
-
-ACCESS_RE = re.compile(' '.join((
-    r'(?P<address>\S+)',
-    r'\S+',
-    r'\S+',
-    r'(?P<date>\[\S+ \S+\])',
-    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
-    r'200 [0-9]+',
-    r'"(?P<referer>[^"]+)"',
-    r'"(?P<useragent>[^"]+)"'
-)))
-
-DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
-
-VISIT_MAX_DURATION = timedelta(hours=1)
-
-
-@dataclass
-class Access:
-    address: str
-    useragent: str
-    referer: str
-    time: datetime
-    resource: str
-
-    @classmethod
-    def from_log(cls, info):
-        resource = info['resource']
-        if resource == '/':
-            resource = '/index.html'
-
-        referer = re.sub(
-            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
-            r'\1',
-            info['referer']
-        )
-        if referer == '/':
-            referer = '/index.html'
-
-        return cls(
-            info['address'], user_agents.parse(info['useragent']),
-            referer, datetime.strptime(info['date'], DATE_FMT),
-            resource
-        )
-
-def interesting(resource):
-    return (
-        resource.endswith('.html')
-        or resource == '/'
-    )
-
-def parse(logs_path):
-    with open(logs_path) as logs_file:
-        logs = logs_file.read().splitlines()
-
-    matches = (ACCESS_RE.match(l) for l in logs)
-    return tuple(
-        Access.from_log(m) for m in matches
-        if (m is not None
-            and interesting(m['resource'])
-            and 'klg.uber.space' not in m['referer'])
-    )
-
-def key(access):
-    return f'{access.address} / {access.useragent}'
-
-def visits(accesses):
-    # Map (IP, user agent) to list of visits. A visit is a list of
-    # accesses. When processing an access, if the previous time for
-    # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
-    # ago, we aggregate it, otherwise, we start a new visit.
-    visits = defaultdict(list)
-
-    for access in accesses:
-        visitor = key(access)
-
-        if visitor in visits:
-            last_access = visits[visitor][-1][-1].time
-
-            if access.time - last_access < VISIT_MAX_DURATION:
-                visits[visitor][-1].append(access)
-                continue
-
-        visits[visitor].append([access])
-
-    return visits
-
-def order(grouped_visits):
-    # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
-    visits = {}
-
-    for i, i_visits in grouped_visits.items():
-        for v in i_visits:
-            visits[(i, v[0].time)] = v
-
-    return visits
-
-def visit_graph(accesses):
-    edges = (f'    "{a.referer}" -> "{a.resource}";'
-             for a in accesses)
-    return '\n'.join((f'digraph visit {{', *edges, '}'))
-
-def graph(visits):
-    date = visits[0][0].time.strftime('%F')
-
-    tempfiles = {
-        f'{date}-{i}.pdf': visit for i, visit in enumerate(visits)
-    }
-
-    for tempfile, visit in tempfiles.items():
-        vgraph = visit_graph(visit)
-
-        with open(tempfile, 'wb') as vfile:
-            vfile.write(
-                run(('dot', '-Tpdf'), text=False, check=True,
-                    capture_output=True, input=vgraph.encode())
-                .stdout
-            )
-
-    run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
-        check=True)
-
-    for f in tempfiles:
-        remove(f)
-
-def analyze(logs_path):
-    accesses = parse(logs_path)
-    visits_by_visitor = visits(accesses)
-    visits_by_time = order(visits_by_visitor)
-
-    print('Visiteurs :', len(visits_by_visitor))
-    print('Visites :', len(visits_by_time))
-
-    pagehits = Counter(a.resource for a in accesses)
-    for page, hits in pagehits.most_common():
-        print(hits, page)
-
-    graph(tuple(visits_by_time.values()))
-
-if __name__ == '__main__':
-    analyze(argv[1])
```
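For reference, the heart of the deleted script was `ACCESS_RE`, which matched one combined-format access-log line per field and kept only successful `GET` requests. The sketch below copies that regex from the diff and applies it to a made-up log line (the address, timestamp, referer, and user agent are invented for illustration); it only prints the named groups, so it runs without the third-party `user_agents` dependency the script needed to build `Access` objects.

```python
import re

# ACCESS_RE as it appeared in the deleted admin/stats.py: one space-joined
# pattern per field of a combined-format log line, matching only 200 GETs.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# Hypothetical log line, invented for illustration.
SAMPLE = ('203.0.113.7 - - [01/Feb/2024:10:15:32 +0100] '
          '"GET /concerts.html?lang=fr HTTP/1.1" 200 5123 '
          '"https://www.quatuorbellefeuille.fr/" '
          '"Mozilla/5.0 (X11; Linux x86_64) Firefox/120.0"')

m = ACCESS_RE.match(SAMPLE)
if m is not None:
    # The query string is captured outside 'resource', so the groups come out
    # as e.g. address='203.0.113.7' and resource='/concerts.html'.
    print(m.groupdict())
```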

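The visit grouping in `visits()` is inactivity-based sessionization: accesses are keyed by (IP, user agent), and a hit joins the visitor's current visit only when it follows the previous hit by less than `VISIT_MAX_DURATION`; otherwise it opens a new visit. The sketch below mirrors that logic with a stripped-down `Access` and invented timestamps, and assumes, as the script did by reading the log top to bottom, that accesses arrive in chronological order.

```python
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta

VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    # Stripped-down stand-in for the Access dataclass of the deleted script.
    address: str
    useragent: str
    time: datetime


def key(access):
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    # Same grouping as the deleted visits(): an access extends the visitor's
    # latest visit if it follows the previous access by less than
    # VISIT_MAX_DURATION, otherwise it starts a fresh visit.
    grouped = defaultdict(list)
    for access in accesses:
        visitor = key(access)
        if visitor in grouped:
            last_access = grouped[visitor][-1][-1].time
            if access.time - last_access < VISIT_MAX_DURATION:
                grouped[visitor][-1].append(access)
                continue
        grouped[visitor].append([access])
    return grouped


# Invented timestamps: two hits 10 minutes apart form one visit,
# a third hit 3 hours later opens a second visit.
t0 = datetime(2024, 2, 1, 10, 0)
hits = [
    Access('203.0.113.7', 'Firefox', t0),
    Access('203.0.113.7', 'Firefox', t0 + timedelta(minutes=10)),
    Access('203.0.113.7', 'Firefox', t0 + timedelta(hours=3)),
]

for visitor, visitor_visits in visits(hits).items():
    print(visitor, [len(v) for v in visitor_visits])  # -> ... [2, 1]
```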