1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from subprocess import run
from sys import argv
import user_agents
# Pattern for one access-log line.  Fields are separated by single
# spaces.  Only GET requests answered with status 200 match, because
# both the method and the status are hard-coded in the pattern.
ACCESS_RE = re.compile(
    r'(?P<address>\S+)'                            # client address
    r' \S+'                                        # ignored field
    r' \S+'                                        # ignored field
    r' (?P<date>\[\S+ \S+\])'                      # bracketed timestamp
    r' "GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"'   # path; query not captured by name
    r' 200 [0-9]+'                                 # status 200, then a number
    r' "(?P<referer>[^"]+)"'                       # quoted referer
    r' "(?P<useragent>[^"]+)"'                     # quoted user-agent string
)

# strptime() format of the ``date`` group, square brackets included.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Threshold used when grouping accesses into visits.
VISIT_MAX_DURATION = timedelta(hours=1)
@dataclass
class Access:
    """One page request extracted from an access-log line."""

    # Client address, as found in the log line.
    address: str
    # Parsed user agent as returned by ``user_agents.parse`` -- not the
    # raw header string.  Kept as a string annotation so the class body
    # does not evaluate it at definition time.
    useragent: 'user_agents.parsers.UserAgent'
    # Raw referer field of the log line.
    referer: str
    # Timestamp of the request, parsed with DATE_FMT.
    time: datetime
    # Requested path; '/' is stored as '/index.html'.
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a match of the access-log pattern.

        *info* must support lookup of the ``address``, ``useragent``,
        ``referer``, ``date`` and ``resource`` groups (e.g. an
        ``re.Match`` object).  The site root is normalised to
        '/index.html' so that both spellings count as the same page.
        """
        resource = info['resource']
        if resource == '/':
            resource = '/index.html'
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=resource,
        )
def interesting(resource):
    """Tell whether *resource* is a page that should be counted.

    Only the site root ('/') and paths ending in '.html' qualify.
    """
    if resource.endswith('.html'):
        return True
    return resource == '/'
def parse(logs_path):
    """Read an access log and return the interesting accesses.

    A line is kept when it matches ACCESS_RE, its resource passes
    interesting(), and its referer does not contain 'klg.uber.space'.

    Returns a tuple of Access objects in log order.
    """
    accesses = []
    # Iterate over the file instead of read().splitlines(): splitlines()
    # also breaks on characters such as \x0b, \x0c, \x85 or \u2028,
    # which may occur inside request-controlled fields and would cut a
    # record in two.  Undecodable bytes are replaced rather than
    # aborting the whole run.
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        for line in logs_file:
            m = ACCESS_RE.match(line.rstrip('\n'))
            if m is None:
                continue
            if not interesting(m['resource']):
                continue
            if 'klg.uber.space' in m['referer']:
                continue
            accesses.append(Access.from_log(m))
    return tuple(accesses)
def key(access):
    """Return the string identifying the visitor behind *access*.

    The identifier combines the access's address and user agent.
    """
    return '{} / {}'.format(access.address, access.useragent)
def visits(accesses):
    """Group accesses into visits, per visitor.

    Returns a defaultdict mapping each visitor (see key()) to a list of
    visits, where a visit is a list of accesses.  An access joins the
    visitor's latest visit when it comes less than VISIT_MAX_DURATION
    after that visit's last access; otherwise it opens a new visit.

    NOTE(review): this presumably expects accesses in chronological
    order, as they appear in the log.
    """
    grouped = defaultdict(list)
    for hit in accesses:
        # Indexing the defaultdict registers first-time visitors with an
        # empty visit list.
        sessions = grouped[key(hit)]
        if sessions and hit.time - sessions[-1][-1].time < VISIT_MAX_DURATION:
            sessions[-1].append(hit)
        else:
            sessions.append([hit])
    return grouped
def order(grouped_visits):
    """Flatten {visitor: [visits]} into {(visitor, t0): accesses}.

    t0 is the time of the first access of the visit.  Entries keep the
    iteration order of *grouped_visits*; no sorting is performed.
    """
    return {
        (visitor, visit[0].time): visit
        for visitor, visitor_visits in grouped_visits.items()
        for visit in visitor_visits
    }
def normalizeref(referer):
    """Return a display name for *referer*.

    - '-' (no referer) becomes 'inconnu';
    - a URL on quatuorbellefeuille.fr/.com (with or without 'www.') is
      reduced to its path, query string dropped, and the root path is
      shown as '/index.html';
    - anything else is returned unchanged.
    """
    if referer == '-':
        return 'inconnu'
    # The pattern is anchored at both ends so that only referers that
    # *are* pages of the site get shortened; an external URL that merely
    # contains the site's address (e.g. in its query string) is left
    # intact.  The query part may be empty ('...page.html?').
    referer = re.sub(
        r'\Ahttps://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.*)?\Z',
        r'\1', referer
    )
    if referer == '/':
        return '/index.html'
    return referer
def _dot_quote(text):
    """Return *text* as a double-quoted DOT identifier."""
    # Log fields are untrusted: an unescaped quote or trailing backslash
    # would end or extend the quoted string and break the graph source.
    escaped = text.replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def _visit_graph(visit):
    """Build the DOT source linking each access's referer to its resource."""
    edges = (
        f' {_dot_quote(normalizeref(a.referer))} -- {_dot_quote(a.resource)};'
        for a in visit
    )
    return '\n'.join(('graph trip {', *edges, '}'))


def analyze(logs_path):
    """Print visit statistics for a log file and draw one graph per visit.

    Prints the number of visitors, the number of visits and the hit
    count of every page.  Then, for each visit, renders the
    referer -- resource graph with ``dot`` into '<date>-<i>.pdf' and
    finally merges all of them into '<date>.pdf' with ``qpdf``, where
    <date> is the day of the first access.  Files are written to the
    current directory.

    Raises subprocess.CalledProcessError when ``dot`` or ``qpdf`` exits
    with a non-zero status.
    """
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)

    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))

    pagehits = Counter(a.resource for a in accesses)
    for page, hits in pagehits.most_common():
        print(hits, page)

    if not accesses:
        # No matching access: there is no date to name the files after
        # and no page to hand to qpdf.
        return

    # Spelled out instead of '%F', which is a platform extension.
    date = accesses[0].time.strftime('%Y-%m-%d')

    pages = []
    for i, visit in enumerate(visits_by_time.values()):
        dot = run(
            ('dot', '-Tpdf'), capture_output=True, check=True,
            input=_visit_graph(visit).encode()
        )
        page = f'{date}-{i}.pdf'
        with open(page, 'wb') as visitgraph:
            visitgraph.write(dot.stdout)
        pages.append(page)

    run(('qpdf', '--empty', '--pages', *pages, '--', f'{date}.pdf'),
        check=True)
if __name__ == '__main__':
    # Fail with a usage message rather than an IndexError traceback when
    # the log path is missing.  Extra arguments are still ignored.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    analyze(argv[1])
|