jturnerusa.dev Git - website/commitdiff
stats.py: only count visits that were successful
author John Turner <jturner.usa@gmail.com>
Thu, 17 Jul 2025 19:59:54 +0000 (15:59 -0400)
committer John Turner <jturner.usa@gmail.com>
Thu, 17 Jul 2025 20:06:21 +0000 (16:06 -0400)
stats.py

index 824bfa4e0e30dde2b98b277db21b2a295cc0b667..448beca5b61e57ae790bf2bca3fe593475b2d3a9 100755 (executable)
--- a/stats.py
+++ b/stats.py
@@ -1,15 +1,20 @@
 #!/usr/bin/python3
 
-import re, os, gzip
+import re
+import os
+import gzip
 from datetime import datetime
 from pathlib import Path
 from dataclasses import dataclass
 from jinja2 import Environment
 from time import time
 
-LINE_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"([A-Z]+) ([^ ]+) HTTP/\d\.\d\""
+LINE_PATTERN = (
+    r"^([^ ]+) ([^ ]+) - \[(.*)\] \"([A-Z]+) ([^ ]+) (HTTP/\d\.\d)\""
+    r" ([0-9]+) ([0-9]+) \"-\" \"([^\"]+)\""
+)
 
-TEMPLATE="""
+TEMPLATE = """
 <!DOCTYPE html>
 
 <html lang="en">
@@ -47,6 +52,7 @@ TEMPLATE="""
 </html>
 """
 
+
 @dataclass
 class Line:
     source: str
@@ -54,17 +60,24 @@ class Line:
     time: str
     kind: str
     resource: str
+    version: str
+    code: int
+    size: int
+    agent: str
+
 
 start_time = time()
 
 logs_dir = Path(os.environ.get("LOGS_DIR", "/var/log/lighttpd"))
 
-log_files = [log for log in logs_dir.iterdir()
-             if log.is_file()
-             and log.name.startswith("access.log")]
+log_files = [
+    log
+    for log in logs_dir.iterdir()
+    if log.is_file() and log.name.startswith("access.log")
+]
 
-urls = {}
-seen = set()
+urls: dict[str, int] = {}
+seen: set[str] = set()
 total_visits = 0
 
 for file in log_files:
@@ -74,25 +87,28 @@ for file in log_files:
         reader = file.open()
 
     for line in reader:
-        if (match := re.match(LINE_PATTERN, line)):
-            line = Line(
+        if match := re.match(LINE_PATTERN, line):
+            entry = Line(
                 match.group(1),
                 match.group(2),
                 match.group(3),
                 match.group(4),
-                match.group(5)
+                match.group(5),
+                match.group(6),
+                int(match.group(7)),
+                int(match.group(8)),
+                match.group(9),
             )
 
-            if line.kind == "GET":
+            if entry.kind == "GET" and entry.code == 200:
                 try:
-                    count = urls[line.resource]
+                    count = urls[entry.resource]
                 except KeyError:
                     count = 0
-                    
-                urls[line.resource] = count + 1
+
+                urls[entry.resource] = count + 1
                 total_visits += 1
-                seen.add(line.source)
-        
+                seen.add(entry.source)
 
 first_log_file = log_files[0]
 logs_start_date = datetime.fromtimestamp(first_log_file.stat().st_ctime)
@@ -104,11 +120,13 @@ seconds = round(time() - start_time, 3)
 environment = Environment()
 template = environment.from_string(TEMPLATE)
 
-print(template.render(
-    seconds=seconds,
-    logs_start_date=logs_start_date.ctime(),
-    logs_days_ago=logs_days_ago,
-    total_visits=total_visits,
-    unique_visits=unique_visits,
-    most_visited_site=most_visited_site
-))
+print(
+    template.render(
+        seconds=seconds,
+        logs_start_date=logs_start_date.ctime(),
+        logs_days_ago=logs_days_ago,
+        total_visits=total_visits,
+        unique_visits=unique_visits,
+        most_visited_site=most_visited_site,
+    )
+)