diff options
| author | Kévin Le Gouguec <kevin.legouguec@gmail.com> | 2021-03-27 21:06:11 +0100 |
|---|---|---|
| committer | Kévin Le Gouguec <kevin.legouguec@gmail.com> | 2021-03-27 21:06:11 +0100 |
| commit | ecae0f066e83b596bc3a590baa1968fa1666ad19 (patch) | |
| tree | 3eecd9019ab300c13e8ed6b29f27f43b05df35d1 | |
| parent | df1c1afc80711b08927fd7968adaf4de5349df10 (diff) | |
| download | quatuorbellefeuille.com-ecae0f066e83b596bc3a590baa1968fa1666ad19.tar.xz | |
Add new script to process multiple log files
| -rwxr-xr-x | admin/stats/dump.py | 191 | ||||
| -rwxr-xr-x | admin/stats/old.py (renamed from admin/stats.py) | 0 | ||||
| -rwxr-xr-x | admin/stats/renamelogs.py | 35 |
3 files changed, 226 insertions, 0 deletions
diff --git a/admin/stats/dump.py b/admin/stats/dump.py
new file mode 100755
index 0000000..7a54acc
--- /dev/null
+++ b/admin/stats/dump.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""Dump daily visitor statistics computed from web server access logs.
+
+Usage: dump.py LOG_FILE... OUTPUT_DIR
+"""
+
+from collections import Counter, defaultdict
+import csv
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from pathlib import Path
+import re
+from sys import argv
+from urllib.parse import urlparse
+from typing import Dict, List, Tuple
+from warnings import warn
+
+import user_agents
+
+
+# One access log line; only successful (200) GET requests match.  Query
+# strings are matched but not captured.
+ACCESS_RE = re.compile(' '.join((
+    r'(?P<address>\S+)',
+    r'\S+',
+    r'\S+',
+    r'(?P<date>\[\S+ \S+\])',
+    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+    r'200 [0-9]+',
+    r'"(?P<referer>[^"]+)(\?\S+)?"',
+    r'"(?P<useragent>[^"]+)"'
+)))
+
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+# Spelled out rather than '%F', which not every platform's strftime
+# supports.
+DAY_FMT = '%Y-%m-%d'
+
+# An access made less than this long after the previous access of the
+# same visitor is counted as part of the same visit.
+VISIT_MAX_DURATION = timedelta(hours=1)
+
+# Referrers on these hosts are recorded as a path instead of a host.
+DOMAINS = {
+    'quatuorbellefeuille.com',
+    'quatuorbellefeuille.fr',
+    'klg.uber.space'
+}
+
+
+def normalize_path(p):
+    """Return P, with the site root replaced by '/index.html'."""
+    if p == '/':
+        return '/index.html'
+    return p
+
+
+@dataclass
+class Access:
+    """A single request, built from one matching log line."""
+
+    address: str
+    useragent: str
+    referrer: str
+    time: datetime
+    resource: str
+
+    @classmethod
+    def from_log(cls, info):
+        """Return the Access described by INFO, a match of ACCESS_RE."""
+        resource = normalize_path(info['resource'])
+
+        referrer = urlparse(info['referer'])
+        if referrer.netloc in DOMAINS:
+            # Coming from one of our own pages: remember which one.
+            ref = normalize_path(referrer.path)
+        else:
+            ref = referrer.netloc
+
+        return cls(
+            info['address'], info['useragent'], ref,
+            datetime.strptime(info['date'], DATE_FMT), resource
+        )
+
+
+def interesting(resource):
+    """Return whether RESOURCE is an HTML page or the site root."""
+    return resource.endswith('.html') or resource == '/'
+
+
+def parse(logs_paths):
+    """Return a tuple of Access for the interesting lines of LOGS_PATHS.
+
+    Lines that ACCESS_RE does not match are skipped.
+    """
+    logs = []
+    for lp in logs_paths:
+        with open(lp) as logs_file:
+            logs += logs_file.read().splitlines()
+
+    matches = (ACCESS_RE.match(line) for line in logs)
+    return tuple(
+        Access.from_log(m) for m in matches
+        if (m is not None and interesting(m['resource']))
+    )
+
+
+# Successive accesses that make up one visit.
+Visit = List[Access]
+
+
+@dataclass
+class Visitor:
+    """The visits made from one (address, user agent) pair."""
+
+    address: str
+    # Kind of user agent, as returned by useragent_kind.
+    useragent: str
+    referrers: List[str]
+    visits: List[Visit]
+
+
+def useragent_kind(ua_string):
+    """Return the kind of UA_STRING: pc, mobile, tablet, bot or n/a."""
+    ua = user_agents.parse(ua_string)
+    if ua.is_pc:
+        return 'pc'
+    if ua.is_mobile:
+        return 'mobile'
+    if ua.is_tablet:
+        return 'tablet'
+    if ua.is_bot:
+        return 'bot'
+    warn(f'Unknown user agent kind: {ua_string}')
+    return 'n/a'
+
+
+def sort_visits(accesses):
+    """Group ACCESSES by visitor, and each visitor's accesses by visit.
+
+    Return a dictionary mapping (address, user agent string) to
+    Visitor.  Each access is compared to the latest one recorded for
+    its visitor, so ACCESSES presumably must be in chronological order.
+    """
+    visitors: Dict[Tuple[str, str], Visitor] = {}
+
+    for a in accesses:
+        key = (a.address, a.useragent)
+
+        visitor = visitors.get(key)
+        if visitor is None:
+            visitor = Visitor(
+                a.address,
+                useragent_kind(a.useragent),
+                # A list, as Visitor.referrers is declared.
+                [a.referrer],
+                [[a]]
+            )
+            visitors[key] = visitor
+            continue
+
+        last_visit = visitor.visits[-1]
+        last_access = last_visit[-1].time
+        if a.time - last_access < VISIT_MAX_DURATION:
+            last_visit.append(a)
+            continue
+
+        visitor.visits.append([a])
+
+    return visitors
+
+
+def find_days(visits):
+    """Return the set of days (at midnight) on which VISITS started."""
+    return {
+        v[0].time.replace(hour=0, minute=0, second=0)
+        for v in visits
+    }
+
+
+def daily_visitors(visitors, output_path):
+    """Print the number of visitors per day and kind of user agent.
+
+    The same table is written to OUTPUT_PATH as CSV.
+    """
+    days: Dict[datetime, Counter] = defaultdict(Counter)
+    columns = ('mobile', 'tablet', 'pc', 'bot', 'n/a')
+
+    print('Visitors:')
+
+    for v in visitors.values():
+        for day in find_days(v.visits):
+            days[day][v.useragent] += 1
+
+    # The csv module does its own newline handling.
+    with open(output_path, 'w', newline='') as f:
+        out = csv.writer(f)
+        out.writerow(('day', 'total', *columns))
+        print('day', 'total', *columns, sep='\t')
+
+        for day in sorted(days):
+            counter = days[day]
+            counters = tuple(counter[c] for c in columns)
+            values = (day.strftime(DAY_FMT), sum(counters), *counters)
+
+            out.writerow(values)
+            print(*values, sep='\t')
+
+
+def daily_stats(visitors, output_dir):
+    """Write the daily statistics of VISITORS to files in OUTPUT_DIR."""
+    output_dir = Path(output_dir)
+    daily_visitors(visitors, output_dir.joinpath('dailyvisitors.csv'))
+    # daily_visits(visitors, output_dir.joinpath('dailyvisits.csv'))
+    # daily_pages_per_visit(visitors, output_dir.joinpath('dailypagespervisit.csv'))
+    # daily_page_hits(visitors, output_dir.joinpath('dailypagehits.csv'))
+    # daily_referrers(visitors, output_dir.joinpath('dailyreferrers.csv'))
+
+
+def global_stats(visitors, output_dir):
+    """Do nothing for now; global statistics are not implemented."""
+    pass
+
+
+def main(logs_paths, output_dir):
+    """Parse LOGS_PATHS and dump their statistics into OUTPUT_DIR."""
+    accesses = parse(logs_paths)
+    visitors = sort_visits(accesses)
+    daily_stats(visitors, output_dir)
+    global_stats(visitors, output_dir)
+
+
+if __name__ == '__main__':
+    if len(argv) < 2:
+        raise SystemExit(f'Usage: {argv[0]} LOG_FILE... OUTPUT_DIR')
+    main(argv[1:-1], argv[-1])
diff --git a/admin/stats.py b/admin/stats/old.py
index e3d46cc..e3d46cc 100755
--- a/admin/stats.py
+++ b/admin/stats/old.py
diff --git a/admin/stats/renamelogs.py b/admin/stats/renamelogs.py
new file mode 100755
index 0000000..34fbccf
--- /dev/null
+++ b/admin/stats/renamelogs.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""Rename access log files after the date of their first matching line.
+
+Usage: renamelogs.py LOG_FILE...
+"""
+
+from datetime import datetime
+from os import rename
+from pathlib import Path
+import re
+from sys import argv
+from warnings import warn
+
+
+# Same fields as the pattern in dump.py; only the day is captured.
+ACCESS_RE = re.compile(' '.join((
+    r'\S+',
+    r'\S+',
+    r'\S+',
+    r'\[(?P<date>[^:]+):\S+ \S+\]',
+    r'"GET [^ ?]+(\?\S+)? [^"]+"',
+    r'200 [0-9]+',
+    r'"[^"]+(\?\S+)?"',
+    r'"[^"]+"'
+)))
+
+DATE_FMT = '%d/%b/%Y'
+
+# Spelled out rather than '%F', which not every platform's strftime
+# supports.
+DAY_FMT = '%Y-%m-%d'
+
+
+def main(paths):
+    """Rename each of PATHS after the date of its first matching line.
+
+    The new name is that date formatted as YYYY-MM-DD.  Files in which
+    nothing matches ACCESS_RE are left alone, with a warning.
+    """
+    for p in paths:
+        with open(p) as f:
+            match = ACCESS_RE.search(f.read())
+
+        if match is None:
+            warn(f'No access found in {p}; not renaming it')
+            continue
+
+        date = datetime.strptime(match.group('date'), DATE_FMT)
+        new_path = Path(p).with_name(date.strftime(DAY_FMT))
+        rename(p, new_path)
+
+
+if __name__ == '__main__':
+    main(argv[1:])
