summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- admin/stats/util.py 67
1 files changed, 43 insertions, 24 deletions
diff --git a/admin/stats/util.py b/admin/stats/util.py
index ced69d9..e79784c 100644
--- a/admin/stats/util.py
+++ b/admin/stats/util.py
@@ -1,6 +1,6 @@
+from collections import Counter, defaultdict
from dataclasses import dataclass
-from datetime import datetime
-from itertools import groupby
+from datetime import datetime, timedelta
import re
from sys import argv
@@ -12,7 +12,7 @@ ACCESS_RE = re.compile(' '.join((
r'\S+',
r'\S+',
r'(?P<date>\[\S+ \S+\])',
- r'"GET (?P<resource>\S+) [^"]+"',
+ r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
r'200 [0-9]+',
r'"(?P<referer>[^"]+)"',
r'"(?P<useragent>[^"]+)"'
@@ -20,6 +20,8 @@ ACCESS_RE = re.compile(' '.join((
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+VISIT_MAX_DURATION = timedelta(hours=1)
+
@dataclass
class Access:
@@ -41,7 +43,6 @@ def interesting(resource):
return (
resource.endswith('.html')
or resource == '/'
- or resource.startswith('/?')
)
def parse(logs_path):
@@ -57,31 +58,49 @@ def parse(logs_path):
def key(access):
return f'{access.address} / {access.useragent}'
-def group(accesses):
- groups = {}
- for k, g in groupby(sorted(accesses, key=key), key=key):
- groups[k] = tuple(
- (a.time, a.resource, a.referer)
- for a in g
- )
- return groups
+def visits(accesses):
+ # Map each visitor key (see key()) to its list of visits; a visit is
+ # a list of accesses. An access arriving less than VISIT_MAX_DURATION
+ # (a timedelta, not a number of seconds) after that visitor's previous
+ # access joins the current visit; otherwise it starts a new visit.
+ visits = defaultdict(list)
+
+ for access in accesses:
+ visitor = key(access)
+
+ if visitor in visits:
+ last_access = visits[visitor][-1][-1].time
+
+ if access.time - last_access < VISIT_MAX_DURATION:
+ visits[visitor][-1].append(access)
+ continue
+
+ visits[visitor].append([access])
+
+ return visits
+
+def order(grouped_visits):
+ # Flatten { visitor ↦ [visits] } to { (visitor, t0) ↦ accesses }, t0 being the time of the visit's first access.
+ visits = {}
-def span(visits):
- return (visits[0][0],
- visits[-1][0]-visits[0][0])
+ for i, i_visits in grouped_visits.items():
+ for v in i_visits:
+ visits[(i, v[0].time)] = v
-def dump(logs_path):
- for k, accesses in group(parse(logs_path)).items():
- print(k)
+ return visits
- t, duration = span(accesses)
- print(t, duration)
+def analyze(logs_path):
+ # Print the number of visitors, the number of visits, then the hit count of each page (most hit first).
+ accesses = parse(logs_path)
+ visits_by_visitor = visits(accesses)
+ visits_by_time = order(visits_by_visitor)
- for (_, rsrc, ref) in accesses:
- print(f'{rsrc} {ref}')
+ print('Visiteurs :', len(visits_by_visitor))
+ print('Visites :', len(visits_by_time))
- print()
+ # NOTE(review): accesses was already iterated by visits() above; if parse() returns a one-shot iterator this Counter would see nothing — confirm parse() returns a re-iterable sequence.
+ pagehits = Counter(a.resource for a in accesses)
+ for page, hits in pagehits.most_common():
+ print(hits, page)
if __name__ == '__main__':
- dump(argv[1])
+ analyze(argv[1])