From 5261442c30c713120ee84da8bc9114494e7151cf Mon Sep 17 00:00:00 2001 From: John Turner Date: Wed, 1 Oct 2025 00:05:11 -0400 Subject: impl deduplication for the sqlite storage backend --- pypaste/server/sqlite/__init__.py | 63 +++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/pypaste/server/sqlite/__init__.py b/pypaste/server/sqlite/__init__.py index 9ae49ad..3d5e1b1 100644 --- a/pypaste/server/sqlite/__init__.py +++ b/pypaste/server/sqlite/__init__.py @@ -4,6 +4,7 @@ import aiosqlite from pypaste.server import Storage, Paste, Key from dataclasses import dataclass from typing import Optional +from uuid import uuid4, UUID @dataclass @@ -12,7 +13,11 @@ class Sqlite(Storage): async def setup(self) -> None: await self.connection.execute( - "create table if not exists sqlite(key blob, data blob)" + "create table if not exists sqliterefs(key blob, refid blob)" + ) + + await self.connection.execute( + "create table if not exists sqlitedata(refid blob, data blob unique)" ) await self.connection.commit() @@ -26,11 +31,24 @@ class Sqlite(Storage): try: await self.connection.execute( "insert into pastes values(?, ?, ?, ?, ?)", - (key.data, key.length, paste.dt.isoformat(), len(data), paste.syntax), + ( + key.data, + key.length, + paste.dt.isoformat(), + len(data), + paste.syntax, + ), + ) + + await self.connection.execute( + "insert into sqlitedata values(?, ?) on conflict(data) do nothing", + (uuid4().bytes, data), ) await self.connection.execute( - "insert into sqlite values(?, ?)", + ( + "insert into sqliterefs values(?, (select refid from sqlitedata where data=?))" + ), ( key.data, data, @@ -44,24 +62,34 @@ class Sqlite(Storage): async def retrieve(self, key: Key) -> Optional[Paste]: async with self.connection.execute( - "select sqlite.data from sqlite where key=? limit 1", (key.data,) + "select refid from sqliterefs where key=?", (key.data,) ) as cursor: match await cursor.fetchone(): case [bytes(data)]: - pass + refid = UUID(bytes=data) case None: return None case _: raise Exception("unreachable") + async with self.connection.execute( + "select data from sqlitedata where refid=?", (refid.bytes,) + ) as cursor: + match await cursor.fetchone(): + case [bytes(data)]: + pass + case _: + raise Exception("unreachable") + info = await self.read_paste_info(key) + # info being None should not be possible unless something is broken assert info is not None def decompress(): - return zstandard.decompress(data).decode() + return zstandard.decompress(data) - text = await asyncio.to_thread(decompress) + text = (await asyncio.to_thread(decompress)).decode() return Paste(info.dt, info.syntax, text) @@ -69,7 +97,18 @@ class Sqlite(Storage): try: await self.connection.execute("delete from pastes where key=?", (key.data,)) - await self.connection.execute("delete from sqlite where key=?", (key.data,)) + await self.connection.execute( + "delete from sqliterefs where key=?", (key.data,) + ) + + await self.connection.execute( + ( + "delete from sqlitedata " + "where not exists " + "(select 1 from sqliterefs " + "where sqliterefs.refid=sqlitedata.refid)" + ) + ) await self.connection.commit() except Exception as e: @@ -79,7 +118,7 @@ class Sqlite(Storage): async def exists(self, key: Key) -> bool: async with self.connection.execute( - "select 1 from sqlite where key=?", (key.data,) + "select 1 from sqliterefs where key=?", (key.data,) ) as cursor: return await cursor.fetchone() is not None @@ -87,13 +126,13 @@ class Sqlite(Storage): async with self.connection.execute( ( "select sum(pastes.size) from pastes " - "inner join sqlite on sqlite.key=pastes.key" + "inner join sqliterefs on sqliterefs.key=pastes.key" ) ) as cursor: match await cursor.fetchone(): case [int(use)]: return use - case None: + case None | [None]: # it can return a tuple with a None in it return None case _: raise Exception("unreachable") @@ -102,7 +141,7 @@ class Sqlite(Storage): async with self.connection.execute( ( "select pastes.key,pastes.key_length from pastes " - "inner join sqlite on sqlite.key=pastes.key " + "inner join sqliterefs on sqliterefs.key=pastes.key " "order by pastes.datetime" ) ) as cursor: -- cgit v1.2.3