"""Count visitors, visits and page hits from a web server access log.

Run as a script with the path of the access log as first argument.
"""

from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from sys import argv

import user_agents


# One log line, field by field (fields are separated by single spaces).
# Only GET requests answered with status 200 match.  The group names are
# the keys looked up by Access.from_log() and parse().
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',                           # client address
    r'\S+',                                        # ignored field
    r'\S+',                                        # ignored field
    r'(?P<date>\[\S+ \S+\])',                      # "[timestamp offset]"
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',   # path, query string dropped
    r'200 [0-9]+',                                 # status 200, response size
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"',
)))

# strptime() format of the 'date' group, brackets included.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Two accesses by the same visitor closer together than this belong to
# the same visit.
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """One matching request extracted from the access log."""

    address: str
    useragent: object  # result of user_agents.parse(), not the raw header
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from an ACCESS_RE match.

        `info` may be any mapping-like object exposing the keys
        'address', 'useragent', 'referer', 'date' and 'resource'.
        Raises ValueError if the date does not follow DATE_FMT.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )


def interesting(resource):
    """Return True if `resource` is a page: the site root or an .html file."""
    return resource == '/' or resource.endswith('.html')


def parse(logs_path):
    """Read the log at `logs_path` and return a tuple of Access objects.

    Lines that do not match ACCESS_RE, or whose resource is not
    interesting(), are skipped.  The order of the file is preserved.
    """
    # Log lines carry client-controlled bytes (user agent, referer, path);
    # errors='replace' keeps one undecodable byte from aborting the run.
    with open(logs_path, errors='replace') as logs_file:
        matches = (ACCESS_RE.match(line) for line in logs_file)
        # The tuple must be built while the file is still open, since the
        # generator above reads it lazily.
        return tuple(
            Access.from_log(m)
            for m in matches
            if m is not None and interesting(m['resource'])
        )


def key(access):
    """Return the string identifying the visitor behind `access`.

    A visitor is the combination of the client address and the string
    form of the parsed user agent.
    """
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    """Group `accesses` into visits, per visitor.

    Returns a defaultdict mapping each visitor key (see key()) to a list
    of visits, a visit being a list of Access objects.  An access joins
    the visitor's latest visit when it comes less than VISIT_MAX_DURATION
    after the previous access of that visit; otherwise it opens a new
    visit.  Accesses are processed in the order given, which is expected
    to be chronological.
    """
    grouped = defaultdict(list)
    for access in accesses:
        visitor_visits = grouped[key(access)]
        if visitor_visits:
            previous = visitor_visits[-1][-1].time
            if access.time - previous < VISIT_MAX_DURATION:
                visitor_visits[-1].append(access)
                continue
        visitor_visits.append([access])
    return grouped


def order(grouped_visits):
    """Flatten { visitor: [visits] } to { (visitor, start time): accesses }.

    The start time of a visit is the time of its first access.
    """
    return {
        (visitor, visit[0].time): visit
        for visitor, visitor_visits in grouped_visits.items()
        for visit in visitor_visits
    }


def analyze(logs_path):
    """Print visitor count, visit count and hits per page for a log file."""
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)
    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))
    pagehits = Counter(a.resource for a in accesses)
    # Most requested pages first.
    for page, hits in pagehits.most_common():
        print(hits, page)


if __name__ == '__main__':
    # Fail with a usage message rather than an IndexError traceback.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    analyze(argv[1])