diff options
author | John Turner <jturner.usa@gmail.com> | 2025-07-15 23:01:13 -0400 |
---|---|---|
committer | John Turner <jturner.usa@gmail.com> | 2025-07-15 23:01:13 -0400 |
commit | 8d5fdbd8606e3a1c64286fe6cd2b70b18b5d392c (patch) | |
tree | 497f26e36a11cdd912d24bb30f9c6c5c60fa54db | |
parent | b2ad5e2050e7b99e3ae5cade74c9559d3da2bdfc (diff) | |
download | website-8d5fdbd8606e3a1c64286fe6cd2b70b18b5d392c.tar.gz |
process line-by-line and add css styling to template
-rwxr-xr-x | stats.py | 114 |
1 file changed, 63 insertions, 51 deletions
@@ -7,44 +7,53 @@ from dataclasses import dataclass
 from jinja2 import Environment
 from time import time
-GET_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"GET ([^ ]*)"
+LINE_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"([A-Z]+) ([^ ]+) HTTP/\d\.\d\""
 TEMPLATE="""
 <!DOCTYPE html>
 <html lang="en">
-<head>
- <meta charset="utf-8">
- <title>Most visited pages</title>
-</head>
-<body>
- <h1>Stats</h1>
- <p>
- Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)
- </p>
- <p>
- Stats generated in {{ seconds }}s
- </p>
- <p>
- Total visits: {{ total_visits }}
- </p>
- <p>
- Unique visits: {{ unique_visits }}
- </p>
- <p>
- Most visited url: {{ most_visited_site }}
- </p>
-</body>
-
+ <head>
+ <meta charset="utf-8">
+ <title>Most visited pages</title>
+ <style type="text/css">
+ .stats {
+ text-align: center
+ }
+ </style>
+ </head>
+
+ <body>
+ <div class="stats">
+ <h1>Stats</h1>
+ <p>
+ Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)
+ </p>
+ <p>
+ Stats generated in {{ seconds }}s
+ </p>
+ <p>
+ Total visits: {{ total_visits }}
+ </p>
+ <p>
+ Unique visits: {{ unique_visits }}
+ </p>
+ <p>
+ Most visited url: {{ most_visited_site }}
+ </p>
+ </div>
+ </body>
 </html>
 """
 @dataclass
-class Get:
+class Line:
 source: str
 dest: str
- url: str
+ time: str
+ kind: str
+ resource: str
 start_time = time()
@@ -54,44 +63,47 @@ log_files = [log for log in logs_dir.iterdir() if log.is_file() and log.name.startswith("access.log")]
-gets = []
+urls = {}
+seen = set()
+total_visits = 0
 for file in log_files:
 if file.suffix == ".gz":
- decoder = gzip.open(file)
- text = decoder.read().decode()
+ reader = gzip.open(file, "rt")
 else:
- text = file.read_text()
-
- for match in re.finditer(GET_PATTERN, text, flags=re.MULTILINE):
- get = Get(match.group(1), match.group(2), match.group(4))
- gets.append(get)
-
-urls = {}
-
-for get in gets:
- try:
- count = urls[get.url]
- except KeyError:
- count = 0
-
- urls[get.url] = count + 1
+ reader = file.open()
+
+ for line in reader:
+ if (match := re.match(LINE_PATTERN, line)):
+ line = Line(
+ match.group(1),
+ match.group(2),
+ match.group(3),
+ match.group(4),
+ match.group(5)
+ )
+
+ if line.kind == "GET":
+ try:
+ count = urls[line.resource]
+ except KeyError:
+ count = 0
+
+ urls[line.resource] = count + 1
+ total_visits += 1
+ seen.add(line.source)
+
 first_log_file = log_files[0]
 logs_start_date = datetime.fromtimestamp(first_log_file.stat().st_ctime)
 logs_days_ago = (datetime.now() - logs_start_date).days
-
-total_visits = len(gets)
-unique_visits = len(set(get.source for get in gets))
+unique_visits = len(seen)
 most_visited_site = sorted(urls.items(), key=lambda item: item[1])[-1][0]
+seconds = round(time() - start_time, 3)
 environment = Environment()
 template = environment.from_string(TEMPLATE)
-seconds = round(time() - start_time, 3)
-
-print("Content-Type: text/html\r\n")
-
 print(template.render(
 seconds=seconds,
 logs_start_date=logs_start_date.ctime(), |