#!/usr/bin/env python3
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from os import remove
from subprocess import run
from sys import argv
import user_agents
# One combined-log-format line for a successful (HTTP 200) GET request.
# The named groups feed Access.from_log / parse(): address, date,
# resource, referer, useragent. (The group names had been lost from the
# pattern — `(?P` without `<name>` is a regex syntax error, and every
# info['...'] lookup downstream relies on them — restored here.)
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',           # client address (IP)
    r'\S+',                        # identd user (ignored)
    r'\S+',                        # authenticated user (ignored)
    r'(?P<date>\[\S+ \S+\])',      # e.g. [10/Oct/2020:13:55:36 +0200]
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',  # path; query string dropped
    r'200 [0-9]+',                 # only successful responses, any size
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))
# strptime format matching the bracketed <date> group above.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
# Same-visitor accesses further apart than this start a new visit.
VISIT_MAX_DURATION = timedelta(hours=1)
@dataclass
class Access:
    """A single logged page hit."""
    address: str
    # NOTE(review): despite the annotation, from_log stores the object
    # returned by user_agents.parse() here, not a plain string.
    useragent: str
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from an ACCESS_RE match (or mapping with the
        same keys): normalizes '/' to '/index.html' for both the resource
        and the referer, and strips the site's own scheme/host/query from
        internal referers so they become bare paths."""
        page = info['resource']
        ref = re.sub(
            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
            r'\1',
            info['referer']
        )
        return cls(
            info['address'],
            user_agents.parse(info['useragent']),
            '/index.html' if ref == '/' else ref,
            datetime.strptime(info['date'], DATE_FMT),
            '/index.html' if page == '/' else page,
        )
def interesting(resource):
    """True for requests worth analyzing: HTML pages and the site root."""
    return resource == '/' or resource.endswith('.html')
def parse(logs_path):
    """Read the access log at logs_path and return a tuple of Access
    objects for every well-formed, interesting GET line, skipping
    self-referrals from klg.uber.space."""
    with open(logs_path) as logs_file:
        lines = logs_file.read().splitlines()
    kept = []
    for line in lines:
        match = ACCESS_RE.match(line)
        if match is None:
            continue
        if not interesting(match['resource']):
            continue
        if 'klg.uber.space' in match['referer']:
            continue
        kept.append(Access.from_log(match))
    return tuple(kept)
def key(access):
    """Visitor identity: address and user agent joined into one string."""
    return '{} / {}'.format(access.address, access.useragent)
def visits(accesses):
    """Group accesses into visits per (address, user agent).

    Returns a mapping from visitor key to a list of visits; each visit is
    a list of accesses. A new access joins the visitor's current visit
    when it falls within VISIT_MAX_DURATION of that visit's last access;
    otherwise it opens a new visit.
    """
    grouped = defaultdict(list)
    for access in accesses:
        visitor_visits = grouped[key(access)]
        if (visitor_visits
                and access.time - visitor_visits[-1][-1].time < VISIT_MAX_DURATION):
            visitor_visits[-1].append(access)
        else:
            visitor_visits.append([access])
    return grouped
def order(grouped_visits):
    """Flatten { visitor ↦ [visits] } into { (visitor, start_time) ↦ accesses },
    keying each visit by its visitor and its first access time."""
    return {
        (visitor, visit[0].time): visit
        for visitor, visitor_visits in grouped_visits.items()
        for visit in visitor_visits
    }
def visit_graph(accesses):
    """Render one visit as Graphviz source: a digraph with one
    referer -> resource edge per access."""
    lines = ['digraph visit {']
    for access in accesses:
        lines.append(f'  "{access.referer}" -> "{access.resource}";')
    lines.append('}')
    return '\n'.join(lines)
def graph(visits):
    """Render every visit to its own PDF with `dot`, concatenate them
    into one <date>.pdf with `qpdf`, then delete the per-visit files.

    The output date comes from the first access of the first visit.
    Requires the `dot` (Graphviz) and `qpdf` executables on PATH.
    """
    date = visits[0][0].time.strftime('%F')
    pages = [f'{date}-{i}.pdf' for i in range(len(visits))]
    for page_path, visit in zip(pages, visits):
        with open(page_path, 'wb') as page_file:
            rendered = run(('dot', '-Tpdf'), text=False, check=True,
                           capture_output=True,
                           input=visit_graph(visit).encode())
            page_file.write(rendered.stdout)
    run(('qpdf', '--empty', '--pages', *pages, '--', f'{date}.pdf'),
        check=True)
    for page_path in pages:
        remove(page_path)
def analyze(logs_path):
    """Top-level report: parse the log, print visitor and visit counts,
    print per-page hit counts (most popular first), and render the
    visit graphs to a combined PDF."""
    accesses = parse(logs_path)
    by_visitor = visits(accesses)
    by_time = order(by_visitor)
    print('Visiteurs :', len(by_visitor))
    print('Visites :', len(by_time))
    hit_counts = Counter(access.resource for access in accesses)
    for page, hits in hit_counts.most_common():
        print(hits, page)
    graph(tuple(by_time.values()))
if __name__ == '__main__':
    # CLI entry point: argv[1] is the path to the access log to analyze.
    analyze(argv[1])