diff options
| author | Kévin Le Gouguec <kevin.legouguec@gmail.com> | 2021-03-28 19:16:36 +0200 |
|---|---|---|
| committer | Kévin Le Gouguec <kevin.legouguec@gmail.com> | 2021-03-28 19:16:36 +0200 |
| commit | a65d69ce7222ce65f66c1c20ac3febf8298fec33 (patch) | |
| tree | 115de6169400e47112ff783e9891d6e61d23909d /admin | |
| parent | 2ac0e2bf6bc2abebba611147ec994da5b6c06e10 (diff) | |
| download | quatuorbellefeuille.com-a65d69ce7222ce65f66c1c20ac3febf8298fec33.tar.xz | |
Add daily referrer counts
Diffstat (limited to 'admin')
| -rwxr-xr-x | admin/stats/dump.py | 66 |
1 file changed, 57 insertions, 9 deletions
```diff
diff --git a/admin/stats/dump.py b/admin/stats/dump.py
index ac412c9..ce7d5b2 100755
--- a/admin/stats/dump.py
+++ b/admin/stats/dump.py
@@ -11,7 +11,7 @@ import re
 from statistics import mean, median, stdev
 from sys import argv
 from urllib.parse import unquote, urlparse
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple
 
 import user_agents
 
@@ -31,15 +31,15 @@ DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
 VISIT_MAX_DURATION = timedelta(hours=1)
 
 
-DOMAINS = {
+DOMAINS = (
     'quatuorbellefeuille.com',
     'quatuorbellefeuille.fr',
     'klg.uber.space'
-}
+)
 
 
 def normalize_path(p):
-    if p == '/':
+    if p in ('', '/'):
         return '/index.html'
     return unquote(p)
 
@@ -57,10 +57,12 @@ class Access:
         resource = normalize_path(info['resource'])
 
         referrer = urlparse(info['referer'])
-        if referrer.netloc in DOMAINS:
+        if referrer.netloc.endswith(DOMAINS):
             ref = normalize_path(referrer.path)
-        else:
+        elif referrer.netloc:
             ref = referrer.netloc
+        else:
+            ref = 'n/a'
 
         return cls(
             info['address'], info['useragent'], ref,
@@ -122,7 +124,6 @@ class UserAgentKind(Enum):
 class Visitor:
     address: str
     useragent: UserAgentKind
-    referrers: List[str]
     visits: List[Visit]
 
 
@@ -137,7 +138,6 @@ def sort_visits(accesses):
             visitor = Visitor(
                 a.address,
                 UserAgentKind.from_useragent(a.useragent),
-                a.referrer,
                 [[a]]
             )
             visitors[key] = visitor
@@ -175,6 +175,26 @@ def find_pages(visitors):
     })
 
 
+def external_referrer(ref):
+    return ref != 'n/a' and not ref.startswith('/')
+
+
+def simplify_referrer(ref):
+    parts = ref.split('.')
+    # Remove leading parts (www., l., m.…) and extension (.com, .fr…).
+    return parts[-2]
+
+
+def find_referrers(visitors):
+    return sorted({
+        simplify_referrer(access.referrer)
+        for v in visitors
+        for visit in v.visits
+        for access in visit
+        if external_referrer(access.referrer)
+    })
+
+
 def daily_visitors(visitors, output_path):
     days: Dict[datetime, Counter] = defaultdict(Counter)
     columns = ('mobile', 'tablet', 'pc', 'bot', 'n/a')
@@ -288,13 +308,41 @@ def daily_page_hits(visitors, output_path):
         print(hits, page, sep='\t')
 
 
+def daily_referrers(visitors, output_path):
+    days: Dict[datetime, Counter] = defaultdict(Counter)
+    columns = find_referrers(visitors.values())
+
+    print('Referrers:')
+
+    for v in visitors.values():
+        for visit in v.visits:
+            day = datetime_day(visit[0].time)
+
+            for access in visit:
+                if not external_referrer(access.referrer):
+                    continue
+                days[day][simplify_referrer(access.referrer)] += 1
+
+    with open(output_path, 'w') as f:
+        out = csv.writer(f)
+        out.writerow(('day', *columns))
+        print('day', *columns, sep='\t')
+
+        for day in sorted(days):
+            refcounts = days[day]
+            values = (day.strftime('%F'), *(refcounts[ref] for ref in columns))
+
+            out.writerow(values)
+            print(*values, sep='\t')
+
+
 def daily_stats(visitors, output_dir):
     output_dir = Path(output_dir)
     daily_visitors(visitors, output_dir.joinpath('dailyvisitors.csv'))
     daily_visits(visitors, output_dir.joinpath('dailyvisits.csv'))
     daily_pages_per_visit(visitors, output_dir.joinpath('dailypagespervisit.csv'))
     daily_page_hits(visitors, output_dir.joinpath('dailypagehits.csv'))
-    # daily_referrers(visitors, output_dir.joinpath('dailyreferrers.csv'))
+    daily_referrers(visitors, output_dir.joinpath('dailyreferrers.csv'))
 
 
 def global_stats(visitors, output_dir):
```
