jturnerusa.dev Git - website/commitdiff
process line-by-line and add css styling to template
author John Turner <jturner.usa@gmail.com>
Wed, 16 Jul 2025 03:01:13 +0000 (23:01 -0400)
committer John Turner <jturner.usa@gmail.com>
Wed, 16 Jul 2025 03:01:13 +0000 (23:01 -0400)
stats.py

index 544e8fa4bd9e44a52ae9501b68f5e3ed985bea04..824bfa4e0e30dde2b98b277db21b2a295cc0b667 100755 (executable)
--- a/stats.py
+++ b/stats.py
@@ -7,44 +7,53 @@ from dataclasses import dataclass
 from jinja2 import Environment
 from time import time
 
-GET_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"GET ([^ ]*)"
+LINE_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"([A-Z]+) ([^ ]+) HTTP/\d\.\d\""
 
 TEMPLATE="""
 <!DOCTYPE html>
 
 <html lang="en">
 
-<head>
-  <meta charset="utf-8">
-  <title>Most visited pages</title>
-</head>
-<body>
-  <h1>Stats</h1>
-  <p>
-     Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)
-  </p>
-  <p>
-     Stats generated in {{ seconds }}s
-  </p>
-  <p>
-    Total visits: {{ total_visits }}
-  </p>
-  <p>
-    Unique visits: {{ unique_visits }}
-  </p>
-  <p>
-    Most visited url: {{ most_visited_site }}
-  </p>
-</body>
-
+  <head>
+    <meta charset="utf-8">
+    <title>Most visited pages</title>
+    <style type="text/css">
+      .stats {
+        text-align: center
+      }
+    </style>
+  </head>
+
+  <body>
+    <div class="stats">
+    <h1>Stats</h1>
+    <p>
+      Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)
+    </p>
+    <p>
+      Stats generated in {{ seconds }}s
+    </p>
+    <p>
+      Total visits: {{ total_visits }}
+    </p>
+    <p>
+      Unique visits: {{ unique_visits }}
+    </p>
+    <p>
+      Most visited url: {{ most_visited_site }}
+    </p>
+    </div>
+  </body>
 </html>
 """
 
 @dataclass
-class Get:
+class Line:
     source: str
     dest: str
-    url: str
+    time: str
+    kind: str
+    resource: str
 
 start_time = time()
 
@@ -54,44 +63,47 @@ log_files = [log for log in logs_dir.iterdir()
              if log.is_file()
              and log.name.startswith("access.log")]
 
-gets = []
+urls = {}
+seen = set()
+total_visits = 0
 
 for file in log_files:
     if file.suffix == ".gz":
-        decoder = gzip.open(file)
-        text = decoder.read().decode()
+        reader = gzip.open(file, "rt")
     else:
-        text = file.read_text()
-
-    for match in re.finditer(GET_PATTERN, text, flags=re.MULTILINE):
-        get = Get(match.group(1), match.group(2), match.group(4))
-        gets.append(get)
-
-urls = {}
-
-for get in gets:
-    try:
-        count = urls[get.url]
-    except KeyError:
-        count = 0
-
-    urls[get.url] = count + 1
+        reader = file.open()
+
+    for line in reader:
+        if (match := re.match(LINE_PATTERN, line)):
+            line = Line(
+                match.group(1),
+                match.group(2),
+                match.group(3),
+                match.group(4),
+                match.group(5)
+            )
+
+            if line.kind == "GET":
+                try:
+                    count = urls[line.resource]
+                except KeyError:
+                    count = 0
+                    
+                urls[line.resource] = count + 1
+                total_visits += 1
+                seen.add(line.source)
+        
 
 first_log_file = log_files[0]
 logs_start_date = datetime.fromtimestamp(first_log_file.stat().st_ctime)
 logs_days_ago = (datetime.now() - logs_start_date).days
-
-total_visits = len(gets)
-unique_visits = len(set(get.source for get in gets))
+unique_visits = len(seen)
 most_visited_site = sorted(urls.items(), key=lambda item: item[1])[-1][0]
+seconds = round(time() - start_time, 3)
 
 environment = Environment()
 template = environment.from_string(TEMPLATE)
 
-seconds = round(time() - start_time, 3)
-
-print("Content-Type: text/html\r\n")
-
 print(template.render(
     seconds=seconds,
     logs_start_date=logs_start_date.ctime(),