Add docstrings to database.py
This commit is contained in:
89
database.py
89
database.py
@@ -2,19 +2,31 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import numpy as np
|
||||
import traceback # WICHTIG: Damit wir den vollen Fehler sehen
|
||||
import traceback
|
||||
from sentence_transformers import util
|
||||
from rapidfuzz import fuzz
|
||||
from config import DB_NAME, APP_DATA_DIR
|
||||
|
||||
class DatabaseHandler:
|
||||
"""
|
||||
Handles all database operations, including initialization,
|
||||
folder management, and searching.
|
||||
"""
|
||||
def __init__(self):
    """Create the handler, record configured paths, and build the schema.

    Reads the application data directory and database name from the
    config module, then initializes the database tables immediately.
    """
    self.db_name = DB_NAME
    self.app_data_dir = APP_DATA_DIR
    # Sentence-transformer model is attached later by other code;
    # starts out unset.
    self.model = None
    self.init_db()
|
||||
|
||||
def init_db(self):
|
||||
"""
|
||||
Initializes the database schema by creating the necessary tables
|
||||
(documents, folders, embeddings) if they don't already exist.
|
||||
"""
|
||||
conn = sqlite3.connect(self.db_name)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("CREATE VIRTUAL TABLE IF NOT EXISTS documents USING fts5(filename, path, content);")
|
||||
@@ -24,45 +36,82 @@ class DatabaseHandler:
|
||||
conn.close()
|
||||
|
||||
def add_folder(self, path):
    """Add a folder path to the set of indexed folders.

    The folder's basename is stored as its display alias. A duplicate
    path is ignored via ``INSERT OR IGNORE`` and still counts as
    success.

    Args:
        path (str): The absolute path of the folder to add.

    Returns:
        bool: True if the folder was added (or already present),
        False if the database operation failed.
    """
    conn = sqlite3.connect(self.db_name)
    try:
        conn.execute(
            "INSERT OR IGNORE INTO folders (path, alias) VALUES (?, ?)",
            (path, os.path.basename(path)),
        )
        conn.commit()
        return True
    except sqlite3.Error:
        # Best-effort: report failure to the caller instead of raising.
        # Narrowed from a blanket `except Exception` so programming
        # errors are no longer silently swallowed.
        return False
    finally:
        conn.close()
|
||||
|
||||
def remove_folder(self, path):
    """Remove a folder and all of its indexed documents.

    Deletes every document whose path starts with *path*, the matching
    rows in ``embeddings``, and the folder entry itself.

    Args:
        path (str): The absolute path of the folder to remove.
    """
    conn = sqlite3.connect(self.db_name)
    try:
        cursor = conn.cursor()
        # Collect rowids of all documents stored under this folder.
        # NOTE(review): a plain prefix LIKE also matches sibling folders
        # such as "/data/docs2" when removing "/data/docs" — confirm
        # indexed paths always include a trailing separator before
        # tightening this.
        cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{path}%",))
        ids = [row[0] for row in cursor.fetchall()]
        if ids:
            # Delete the documents and their embedding vectors.
            cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",))
            placeholders = ','.join('?' * len(ids))
            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids)
        # Remove the folder registration itself.
        cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
        conn.commit()
    finally:
        # Previously the connection leaked if any statement raised.
        conn.close()
|
||||
|
||||
def get_folders(self):
    """Return all indexed folder paths.

    Returns:
        list: Every path stored in the ``folders`` table.
    """
    connection = sqlite3.connect(self.db_name)
    records = connection.execute("SELECT path FROM folders").fetchall()
    connection.close()
    return [folder_path for (folder_path,) in records]
|
||||
|
||||
def search(self, query):
|
||||
# Sicherheitscheck
|
||||
"""
|
||||
Performs a hybrid search combining semantic and lexical (keyword) search.
|
||||
|
||||
Args:
|
||||
query (str): The search query.
|
||||
|
||||
Returns:
|
||||
list: A list of search results, each containing
|
||||
(filename, path, snippet).
|
||||
"""
|
||||
# Safety check
|
||||
if not query.strip() or not self.model:
|
||||
return []
|
||||
|
||||
try:
|
||||
# 1. Semantische Vorbereitung
|
||||
# 1. Semantic Preparation
|
||||
q_vec = self.model.encode(query, convert_to_tensor=False)
|
||||
|
||||
conn = sqlite3.connect(self.db_name)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Embeddings laden
|
||||
# Load embeddings
|
||||
cursor.execute("SELECT doc_id, vec FROM embeddings")
|
||||
data = cursor.fetchall()
|
||||
doc_ids = [d[0] for d in data]
|
||||
@@ -71,16 +120,16 @@ class DatabaseHandler:
|
||||
conn.close()
|
||||
return []
|
||||
|
||||
# Umwandlung BLOB -> Numpy Array
|
||||
# Hier knallt es oft, wenn die DB korrupt ist oder Dimensionen nicht passen
|
||||
# Convert BLOB -> Numpy Array
|
||||
# This can fail if the DB is corrupt or dimensions mismatch
|
||||
vecs = np.array([np.frombuffer(d[1], dtype=np.float32) for d in data])
|
||||
|
||||
# Cosine Similarity berechnen
|
||||
# Calculate Cosine Similarity
|
||||
scores = util.cos_sim(q_vec, vecs)[0].numpy()
|
||||
scores = np.clip(scores, 0, 1)
|
||||
sem_map = {did: float(s) for did, s in zip(doc_ids, scores)}
|
||||
|
||||
# 2. Lexikalische Suche (FTS)
|
||||
# 2. Lexical Search (FTS)
|
||||
words = query.replace('"', '').split()
|
||||
if not words: words = [query]
|
||||
fts_query = " OR ".join([f'"{w}"*' for w in words])
|
||||
@@ -88,29 +137,29 @@ class DatabaseHandler:
|
||||
try:
|
||||
fts_rows = cursor.execute("SELECT rowid, filename, content FROM documents WHERE documents MATCH ? LIMIT 100", (fts_query,)).fetchall()
|
||||
except Exception as e:
|
||||
print(f"FTS Fehler (ignoriert): {e}")
|
||||
print(f"FTS Error (ignored): {e}")
|
||||
fts_rows = []
|
||||
|
||||
lex_map = {}
|
||||
for did, fname, content in fts_rows:
|
||||
r1 = fuzz.partial_ratio(query.lower(), fname.lower())
|
||||
# Content kürzen für Performance
|
||||
# Truncate content for performance
|
||||
r2 = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower())
|
||||
lex_map[did] = max(r1, r2) / 100.0
|
||||
|
||||
# 3. Hybrid Fusion
|
||||
final = {}
|
||||
ALPHA = 0.65
|
||||
BETA = 0.35
|
||||
ALPHA = 0.65 # Weight for semantic score
|
||||
BETA = 0.35 # Weight for lexical score
|
||||
for did, s_score in sem_map.items():
|
||||
if s_score < 0.15 and did not in lex_map: continue
|
||||
l_score = lex_map.get(did, 0.0)
|
||||
h_score = (s_score * ALPHA) + (l_score * BETA)
|
||||
# Kleiner Boost wenn beides passt
|
||||
# Small boost if both scores are good
|
||||
if s_score > 0.4 and l_score > 0.6: h_score += 0.1
|
||||
final[did] = h_score
|
||||
|
||||
# 4. Ergebnisse holen
|
||||
# 4. Fetch Results
|
||||
sorted_ids = sorted(final.keys(), key=lambda x: final[x], reverse=True)[:50]
|
||||
results = []
|
||||
for did in sorted_ids:
|
||||
@@ -121,8 +170,8 @@ class DatabaseHandler:
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
# DIESER TEIL IST NEU: Er schreibt den Fehler ins Logfile
|
||||
print(f"!!! KRITISCHER FEHLER IN DER SUCHE !!!")
|
||||
print(f"Fehler: {e}")
|
||||
# NEW: This part writes the error to the log file
|
||||
print(f"!!! CRITICAL ERROR IN SEARCH !!!")
|
||||
print(f"Error: {e}")
|
||||
print(traceback.format_exc())
|
||||
return []
|
||||
Reference in New Issue
Block a user