summaryrefslogtreecommitdiff
path: root/admin/stats.py
diff options
context:
space:
mode:
Diffstat (limited to 'admin/stats.py')
-rwxr-xr-xadmin/stats.py153
1 files changed, 0 insertions, 153 deletions
diff --git a/admin/stats.py b/admin/stats.py
deleted file mode 100755
index e3d46cc..0000000
--- a/admin/stats.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python3
-
-from collections import Counter, defaultdict
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-import re
-from os import remove
-from subprocess import run
-from sys import argv
-
-import user_agents
-
-
-ACCESS_RE = re.compile(' '.join((
- r'(?P<address>\S+)',
- r'\S+',
- r'\S+',
- r'(?P<date>\[\S+ \S+\])',
- r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
- r'200 [0-9]+',
- r'"(?P<referer>[^"]+)"',
- r'"(?P<useragent>[^"]+)"'
-)))
-
-DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
-
-VISIT_MAX_DURATION = timedelta(hours=1)
-
-
-@dataclass
-class Access:
- address: str
- useragent: str
- referer: str
- time: datetime
- resource: str
-
- @classmethod
- def from_log(cls, info):
- resource = info['resource']
- if resource == '/':
- resource = '/index.html'
-
- referer = re.sub(
- r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
- r'\1',
- info['referer']
- )
- if referer == '/':
- referer = '/index.html'
-
- return cls(
- info['address'], user_agents.parse(info['useragent']),
- referer, datetime.strptime(info['date'], DATE_FMT),
- resource
- )
-
-def interesting(resource):
- return (
- resource.endswith('.html')
- or resource == '/'
- )
-
-def parse(logs_path):
- with open(logs_path) as logs_file:
- logs = logs_file.read().splitlines()
-
- matches = (ACCESS_RE.match(l) for l in logs)
- return tuple(
- Access.from_log(m) for m in matches
- if (m is not None
- and interesting(m['resource'])
- and 'klg.uber.space' not in m['referer'])
- )
-
-def key(access):
- return f'{access.address} / {access.useragent}'
-
-def visits(accesses):
- # Map (IP, user agent) to list of visits. A visit is a list of
- # accesses. When processing an access, if the previous time for
- # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
- # ago, we aggregate it, otherwise, we start a new visit.
- visits = defaultdict(list)
-
- for access in accesses:
- visitor = key(access)
-
- if visitor in visits:
- last_access = visits[visitor][-1][-1].time
-
- if access.time - last_access < VISIT_MAX_DURATION:
- visits[visitor][-1].append(access)
- continue
-
- visits[visitor].append([access])
-
- return visits
-
-def order(grouped_visits):
- # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
- visits = {}
-
- for i, i_visits in grouped_visits.items():
- for v in i_visits:
- visits[(i, v[0].time)] = v
-
- return visits
-
-def visit_graph(accesses):
- edges = (f' "{a.referer}" -> "{a.resource}";'
- for a in accesses)
- return '\n'.join((f'digraph visit {{', *edges, '}'))
-
-def graph(visits):
- date = visits[0][0].time.strftime('%F')
-
- tempfiles = {
- f'{date}-{i}.pdf': visit for i, visit in enumerate(visits)
- }
-
- for tempfile, visit in tempfiles.items():
- vgraph = visit_graph(visit)
-
- with open(tempfile, 'wb') as vfile:
- vfile.write(
- run(('dot', '-Tpdf'), text=False, check=True,
- capture_output=True, input=vgraph.encode())
- .stdout
- )
-
- run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
- check=True)
-
- for f in tempfiles:
- remove(f)
-
-def analyze(logs_path):
- accesses = parse(logs_path)
- visits_by_visitor = visits(accesses)
- visits_by_time = order(visits_by_visitor)
-
- print('Visiteurs :', len(visits_by_visitor))
- print('Visites :', len(visits_by_time))
-
- pagehits = Counter(a.resource for a in accesses)
- for page, hits in pagehits.most_common():
- print(hits, page)
-
- graph(tuple(visits_by_time.values()))
-
-if __name__ == '__main__':
- analyze(argv[1])