from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from sys import argv
import user_agents
# Pattern for one access-log line. The fields are separated by single
# spaces and appear in this order: address, two ignored fields, the
# bracketed timestamp, the quoted request, status and size, the quoted
# referer and the quoted user agent (this looks like the Apache/nginx
# "combined" layout -- confirm against the server configuration).
#
# Only GET requests answered with status 200 match. The query string is
# matched outside the `resource` group, so `resource` is the bare path.
# The group names are the keys Access.from_log() and parse() look up.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime() format of the `date` group, brackets included.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Largest gap between two consecutive accesses of the same visitor for
# them to be counted as part of the same visit.
VISIT_MAX_DURATION = timedelta(hours=1)
@dataclass
class Access:
    """A single request taken from one matching access-log line."""

    # Value of the `address` field of the log line.
    address: str
    # NOTE(review): from_log() stores the object returned by
    # user_agents.parse() here, not the raw header string, so the `str`
    # annotation does not describe what is actually held.
    useragent: str
    # Value of the `referer` field of the log line.
    referer: str
    # Request timestamp, parsed from the `date` field with DATE_FMT.
    time: datetime
    # Value of the `resource` field of the log line.
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from the named fields of a parsed log line.

        Args:
            info: Object indexable by the names 'address', 'useragent',
                'referer', 'date' and 'resource' (parse() passes a regex
                match object).

        Returns:
            A new Access whose user agent has been run through
            user_agents.parse() and whose time is a datetime.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )
def interesting(resource):
    """Tell whether a requested resource should be counted.

    Only the site root ('/') and paths ending in '.html' qualify.

    Args:
        resource: Requested path, as a string.

    Returns:
        True if the resource should be kept, False otherwise.
    """
    if resource == '/':
        return True
    return resource.endswith('.html')
def parse(logs_path):
    """Read an access log and return the accesses worth counting.

    Lines that do not match ACCESS_RE are skipped, as are matching lines
    whose resource is rejected by interesting().

    Args:
        logs_path: Path of the access log file to read.

    Returns:
        A tuple of Access objects, in the order of the lines in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Parts of a log line come straight from the client's request, so the
    # file may contain bytes that are not valid in the default encoding.
    # Replace them instead of aborting the whole run.
    with open(logs_path, errors='replace') as logs_file:
        # Stream the file line by line rather than read().splitlines():
        # this avoids holding the log in memory twice and splits on
        # newlines only. re.match() anchors at the start of the string
        # only, so the trailing newline left on each line is harmless.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        # The generator must be consumed while the file is still open.
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )
def key(access):
    """Return the string that identifies the visitor behind an access.

    The key joins the access's address and its user agent (formatted as
    text) with ' / ', so two accesses share a key when both are equal.
    """
    address, agent = access.address, access.useragent
    return f'{address} / {agent}'
def visits(accesses):
    """Group accesses into visits, per visitor.

    Accesses are handled in the order given. An access joins the
    visitor's most recent visit when it comes less than
    VISIT_MAX_DURATION after the last access recorded in that visit;
    otherwise it opens a new visit. The gap is measured from the
    previous access, not from the start of the visit, so a visit as a
    whole may span more than VISIT_MAX_DURATION.

    Args:
        accesses: Iterable of accesses, each with a `time` attribute and
            usable with key().

    Returns:
        A defaultdict mapping each visitor key to a list of visits,
        where a visit is a list of accesses.
    """
    by_visitor = defaultdict(list)
    for access in accesses:
        # Looking the visitor up creates an empty visit list on first
        # sight; it always receives a visit before the loop moves on.
        sessions = by_visitor[key(access)]
        if sessions and access.time - sessions[-1][-1].time < VISIT_MAX_DURATION:
            sessions[-1].append(access)
        else:
            sessions.append([access])
    return by_visitor
def order(grouped_visits):
    """Flatten per-visitor visits into a single mapping of visits.

    Turns { visitor ↦ [visits] } into { (visitor, t0) ↦ accesses },
    where t0 is the time of the first access of the visit.

    Args:
        grouped_visits: Mapping from a visitor key to a list of visits,
            each visit being a non-empty list of accesses.

    Returns:
        A dict with one entry per visit, in the iteration order of the
        input.
    """
    return {
        (visitor, accesses[0].time): accesses
        for visitor, visitor_visits in grouped_visits.items()
        for accesses in visitor_visits
    }
def analyze(logs_path):
    """Print visit statistics for an access log.

    Writes to standard output the number of distinct visitors, the
    number of visits, and then one line per resource with its hit
    count, most requested first.

    Args:
        logs_path: Path of the access log file to analyze.
    """
    accesses = parse(logs_path)
    by_visitor = visits(accesses)
    by_start = order(by_visitor)

    print('Visiteurs :', len(by_visitor))
    print('Visites :', len(by_start))

    hit_counts = Counter(access.resource for access in accesses)
    for resource, count in hit_counts.most_common():
        print(count, resource)
if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError traceback when
    # the log path is missing. Extra arguments are still ignored.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    analyze(argv[1])