#!/usr/bin/python3
"""Print visit statistics gathered from web-server access logs.

Every ``access.log*`` file (plain or gzip-compressed) found in the directory
named by the ``LOGS_DIR`` environment variable (default
``/var/log/lighttpd``) is scanned, GET requests are tallied per resource,
and a short report is rendered with Jinja2 and written to stdout.
"""
import gzip
import os
import re
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from time import time

from jinja2 import Environment

# Capture groups, in order: two space-free leading fields, the bracketed
# timestamp, the upper-case request method and the requested resource.
LINE_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"([A-Z]+) ([^ ]+) HTTP/\d\.\d\""

TEMPLATE = """ Most visited pages

Stats

Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)

Stats generated in {{ seconds }}s

Total visits: {{ total_visits }}

Unique visits: {{ unique_visits }}

Most visited url: {{ most_visited_site }}

"""


@dataclass
class Line:
    """One parsed access-log entry; fields follow LINE_PATTERN's group order."""

    source: str  # group 1 - presumably the client address; verify log format
    dest: str  # group 2 - presumably the requested host; verify log format
    time: str  # group 3 - raw text found between the square brackets
    kind: str  # group 4 - request method, e.g. "GET"
    resource: str  # group 5 - requested path as written in the request line


def _open_log(path):
    """Open *path* for text reading, transparently decompressing ``.gz`` files."""
    # Request lines are client-controlled, so undecodable bytes are replaced
    # rather than allowed to abort the whole run.
    if path.suffix == ".gz":
        return gzip.open(path, "rt", encoding="utf-8", errors="replace")
    return path.open("r", encoding="utf-8", errors="replace")


start_time = time()

logs_dir = Path(os.environ.get("LOGS_DIR", "/var/log/lighttpd"))
log_files = [
    log
    for log in logs_dir.iterdir()
    if log.is_file() and log.name.startswith("access.log")
]
if not log_files:
    # Without any log file there is no start date to report.
    raise SystemExit(f"No access.log* files found in {logs_dir}")

urls = Counter()
seen = set()
total_visits = 0

line_re = re.compile(LINE_PATTERN)  # compiled once, outside the hot loop

for file in log_files:
    # The context manager guarantees the file is closed even on errors.
    with _open_log(file) as reader:
        for raw_line in reader:
            match = line_re.match(raw_line)
            if match is None:
                continue
            entry = Line(*match.groups())
            # NOTE(review): the original indentation was lost; this assumes
            # visits and unique sources are counted for GET requests only.
            if entry.kind == "GET":
                urls[entry.resource] += 1
                total_visits += 1
                seen.add(entry.source)

# Path.iterdir() yields entries in arbitrary order, so pick the file with the
# oldest change time explicitly instead of trusting the first list element.
# NOTE(review): st_ctime is the inode change time (creation time on Windows),
# which only approximates when logging actually started.
first_log_file = min(log_files, key=lambda path: path.stat().st_ctime)
logs_start_date = datetime.fromtimestamp(first_log_file.stat().st_ctime)
logs_days_ago = (datetime.now() - logs_start_date).days

unique_visits = len(seen)
# Fall back to a placeholder when no GET request was seen at all; on equal
# counts the URL encountered first wins.
most_visited_site = urls.most_common(1)[0][0] if urls else "n/a"

seconds = round(time() - start_time, 3)

# NOTE(review): autoescaping is off and most_visited_site comes straight from
# the logs; enable autoescape if this output is ever served as HTML.
environment = Environment()
template = environment.from_string(TEMPLATE)
print(template.render(
    seconds=seconds,
    logs_start_date=logs_start_date.ctime(),
    logs_days_ago=logs_days_ago,
    total_visits=total_visits,
    unique_visits=unique_visits,
    most_visited_site=most_visited_site,
))