from jinja2 import Environment
from time import time
-GET_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"GET ([^ ]*)"
+LINE_PATTERN = r"^([^ ]+) ([^ ]+) - \[(.*)\] \"([A-Z]+) ([^ ]+) HTTP/\d\.\d\""
TEMPLATE="""
<!DOCTYPE html>
<html lang="en">
-<head>
- <meta charset="utf-8">
- <title>Most visited pages</title>
-</head>
-<body>
- <h1>Stats</h1>
- <p>
- Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)
- </p>
- <p>
- Stats generated in {{ seconds }}s
- </p>
- <p>
- Total visits: {{ total_visits }}
- </p>
- <p>
- Unique visits: {{ unique_visits }}
- </p>
- <p>
- Most visited url: {{ most_visited_site }}
- </p>
-</body>
-
+ <head>
+ <meta charset="utf-8">
+ <title>Most visited pages</title>
+ <style type="text/css">
+ .stats {
+ text-align: center
+ }
+ </style>
+ </head>
+
+ <body>
+ <div class="stats">
+ <h1>Stats</h1>
+ <p>
+ Logs start at {{ logs_start_date }} ({{ logs_days_ago }} days ago)
+ </p>
+ <p>
+ Stats generated in {{ seconds }}s
+ </p>
+ <p>
+ Total visits: {{ total_visits }}
+ </p>
+ <p>
+ Unique visits: {{ unique_visits }}
+ </p>
+ <p>
+ Most visited url: {{ most_visited_site }}
+ </p>
+ </div>
+ </body>
</html>
"""
@dataclass
-class Get:
+class Line:
source: str
dest: str
- url: str
+ time: str
+ kind: str
+ resource: str
start_time = time()
if log.is_file()
and log.name.startswith("access.log")]
-gets = []
+urls = {}
+seen = set()
+total_visits = 0
for file in log_files:
if file.suffix == ".gz":
- decoder = gzip.open(file)
- text = decoder.read().decode()
+ reader = gzip.open(file, "rt")
else:
- text = file.read_text()
-
- for match in re.finditer(GET_PATTERN, text, flags=re.MULTILINE):
- get = Get(match.group(1), match.group(2), match.group(4))
- gets.append(get)
-
-urls = {}
-
-for get in gets:
- try:
- count = urls[get.url]
- except KeyError:
- count = 0
-
- urls[get.url] = count + 1
+ reader = file.open()
+
+ for line in reader:
+ if (match := re.match(LINE_PATTERN, line)):
+ line = Line(
+ match.group(1),
+ match.group(2),
+ match.group(3),
+ match.group(4),
+ match.group(5)
+ )
+
+ if line.kind == "GET":
+ try:
+ count = urls[line.resource]
+ except KeyError:
+ count = 0
+
+ urls[line.resource] = count + 1
+ total_visits += 1
+ seen.add(line.source)
+
first_log_file = log_files[0]
logs_start_date = datetime.fromtimestamp(first_log_file.stat().st_ctime)
logs_days_ago = (datetime.now() - logs_start_date).days
-
-total_visits = len(gets)
-unique_visits = len(set(get.source for get in gets))
+unique_visits = len(seen)
most_visited_site = sorted(urls.items(), key=lambda item: item[1])[-1][0]
+seconds = round(time() - start_time, 3)
environment = Environment()
template = environment.from_string(TEMPLATE)
-seconds = round(time() - start_time, 3)
-
-print("Content-Type: text/html\r\n")
-
print(template.render(
seconds=seconds,
logs_start_date=logs_start_date.ctime(),