Enhance hybrid search functionality with ZIP file support; refactor search logic for improved accuracy and performance.

2026-01-09 17:36:49 +01:00
parent f8e850e8f0
commit 4ea3793cec
1 changed file with 181 additions and 117 deletions
--- a/uff_app.py
+++ b/uff_app.py
@@ -3,9 +3,11 @@ import os
 import sqlite3
 import pdfplumber
 import numpy as np
 import zipfile  # WICHTIG: Für Zip-Dateien
 import io       # WICHTIG: Um Dateien im Arbeitsspeicher zu verarbeiten
 from sentence_transformers import SentenceTransformer, util
-# NEU: Für die Fuzzy-Logik
+# Für die Fuzzy-Logik & Suche
 from rapidfuzz import process, fuzz
 from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
@@ -15,32 +17,24 @@ from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
 from PyQt6.QtCore import Qt, QThread, pyqtSignal, QUrl
 from PyQt6.QtGui import QDesktopServices
-# --- 1. DATENBANK MANAGER (Mit Semantischer Suche) ---
+# --- 1. DATENBANK MANAGER (Mit Hybrid Search Scoring) ---
 class DatabaseHandler:
    def __init__(self):
        # ... (same as before)
        if os.name == 'nt':
            base_dir = os.getenv('LOCALAPPDATA')
        else:
            # Mac/Linux: ~/.local/share/uff_search
            base_dir = os.path.join(os.path.expanduser("~"), ".local", "share")
        # 2. Wir erstellen unseren eigenen Unterordner
        self.app_data_dir = os.path.join(base_dir, "UFF_Search")
        # Falls der Ordner nicht existiert, erstellen wir ihn
        if not os.path.exists(self.app_data_dir):
            os.makedirs(self.app_data_dir)
        # 3. Der Pfad zur Datenbank
        self.db_name = os.path.join(self.app_data_dir, "uff_index.db")
        # Debug-Info (falls du es im Terminal testest)
        print(f"Datenbank Pfad: {self.db_name}")
        # 4. Semantisches Modell laden
        # Wir geben dem User Feedback, weil das dauern kann
        print("Lade das semantische Modell (all-MiniLM-L6-v2)...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        print("Modell geladen.")
@@ -62,7 +56,7 @@ class DatabaseHandler:
                alias TEXT
            );
        """)
-        # NEU: Tabelle für die Vektor-Embeddings
+        # Tabelle für die Vektor-Embeddings
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                doc_id INTEGER PRIMARY KEY,
@@ -86,16 +80,13 @@ class DatabaseHandler:
    def remove_folder(self, path):
        """Remove a registered folder and every document indexed beneath it.

        Deletes the matching rows from ``embeddings`` and ``documents`` and the
        folder's own row from ``folders`` in a single transaction.

        Args:
            path: Folder path exactly as stored in ``folders.path``. An empty
                value is ignored, because an empty prefix would match every
                indexed document.
        """
        if not path:
            return

        # Indexed paths are built with os.path.join(root, file), so every path
        # below the folder starts with the folder plus one separator. Joining
        # with "" appends that separator only when it is missing, which keeps
        # sibling folders such as "docs_old" from matching "docs".
        prefix = os.path.join(path, "")
        # substr() compares the prefix literally (and case-sensitively), so
        # "%" and "_" inside a path are not treated as LIKE wildcards.
        under_folder = "substr(path, 1, ?) = ?"
        prefix_args = (len(prefix), prefix)

        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            # Embeddings go first, selected through a subquery, so no long
            # "IN (?, ?, ...)" list is needed; such a list can exceed SQLite's
            # bound-parameter limit for folders with many documents.
            cursor.execute(
                "DELETE FROM embeddings WHERE doc_id IN "
                f"(SELECT rowid FROM documents WHERE {under_folder})",
                prefix_args,
            )
            cursor.execute(f"DELETE FROM documents WHERE {under_folder}", prefix_args)
            # Finally drop the folder entry itself.
            cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
            conn.commit()
        finally:
            # Closing without a commit rolls back a partially applied removal.
            conn.close()
@@ -109,7 +100,7 @@ class DatabaseHandler:
    def search(self, query):
        if not query.strip(): return []
-        # --- PHASE 1: SEMANTISCHE SUCHE ---
+        # --- PHASE 1: SEMANTISCHE SUCHE (Vektor) ---
        query_embedding = self.model.encode(query, convert_to_tensor=False)
        conn = sqlite3.connect(self.db_name)
@@ -120,82 +111,88 @@ class DatabaseHandler:
        doc_ids = [item[0] for item in all_embeddings_data]
-        # Konvertiere BLOBs zurück zu Vektoren
+        if not doc_ids:
            conn.close()
            return []
        # BLOBs zurück zu Vektoren
        all_embeddings = np.array([np.frombuffer(item[1], dtype=np.float32) for item in all_embeddings_data])
-        # Cosine Similarity berechnen
+        # Cosine Similarity (Werte zwischen -1 und 1)
-        semantic_scores = {}
+        # clip auf 0, da negative Werte hier irrelevant sind
        if len(all_embeddings) > 0:
        cos_scores = util.cos_sim(query_embedding, all_embeddings)[0].numpy()
        cos_scores = np.clip(cos_scores, 0, 1) 
-            for i, score in enumerate(cos_scores):
+        # Map: doc_id -> Semantic Score (0.0 - 1.0)
-                # Nur relevante Ergebnisse (>35% Ähnlichkeit) berücksichtigen
+        semantic_map = {doc_id: float(score) for doc_id, score in zip(doc_ids, cos_scores)}
                if score > 0.35:
                    # Wir gewichten die semantische Suche hoch (z.B. max 100 Pkt)
                    semantic_scores[doc_ids[i]] = float(score) * 100
-        # --- PHASE 2: STICHWORTSUCHE (FTS) ---
+        # --- PHASE 2: STICHWORTSUCHE (FTS & Fuzzy) ---
        words = query.replace('"', '').split()
        if not words: words = [query]
        sql_query_parts = [f'"{w}"*' for w in words]
        sql_query_string = " OR ".join(sql_query_parts)
-        sql = """
+        try:
-            SELECT rowid, filename, path, content
+            # Wir holen Kandidaten, die die Wörter enthalten
            fts_rows = cursor.execute("""
                SELECT rowid, filename, content 
                FROM documents 
                WHERE documents MATCH ? 
-            LIMIT 200
+                LIMIT 100
-        """
+            """, (sql_query_string,)).fetchall()
        try:
            fts_rows = cursor.execute(sql, (sql_query_string,)).fetchall()
        except:
            fts_rows = []
-        # --- PHASE 3: KOMBINATION & BEWERTUNG ---
+        lexical_map = {}
        combined_scores = {}
-        # Scores aus der semantischen Suche übernehmen
+        for doc_id, filename, content in fts_rows:
-        for doc_id, score in semantic_scores.items():
+            # Fuzzy-Score berechnen (0 bis 100) -> normalisieren auf 0.0 - 1.0
-            combined_scores[doc_id] = score
+            ratio_name = fuzz.partial_ratio(query.lower(), filename.lower())
            ratio_content = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower())
-        # Scores aus der FTS-Suche hinzufügen/kombinieren
+            best_ratio = max(ratio_name, ratio_content)
-        for doc_id, filename, path, content in fts_rows:
+            lexical_map[doc_id] = best_ratio / 100.0
            # Fuzzy-Score für Relevanz
            score_name = fuzz.WRatio(query.lower(), filename.lower())
            check_content = content[:5000] if content else ""
            score_content = fuzz.partial_token_set_ratio(query.lower(), check_content.lower())
            fuzzy_score = (score_name * 0.2) + (score_content * 0.8)
-            # Bonus für exakte Wort-Treffer
+        # --- PHASE 3: HYBRID FUSION (Kombination) ---
-            if all(w.lower() in (filename + check_content).lower() for w in words):
+        final_scores = {}
                fuzzy_score += 20
-            # Wenn das Dokument bereits durch die semantische Suche gefunden wurde,
+        # Gewichtung anpassen
-            # geben wir einen massiven Bonus. Ansonsten normaler Score.
+        ALPHA = 0.65  # 65% Semantik
-            if doc_id in combined_scores:
+        BETA = 0.35   # 35% Stichwort
                combined_scores[doc_id] += fuzzy_score + 50 # Bonus!
            else:
                combined_scores[doc_id] = fuzzy_score
-        # --- PHASE 4: SORTIEREN & ERGEBNISSE HOLEN ---
+        for doc_id, sem_score in semantic_map.items():
-        # Sortiere die doc_ids nach dem höchsten Score
+            # Filter: Nur Ergebnisse mit minimaler Relevanz betrachten
-        sorted_doc_ids = sorted(combined_scores.keys(), key=lambda doc_id: combined_scores[doc_id], reverse=True)
+            if sem_score < 0.15 and doc_id not in lexical_map:
                continue
-        # Top 50 Ergebnisse
+            lex_score = lexical_map.get(doc_id, 0.0)
-        final_results = []
+            
-        for doc_id in sorted_doc_ids[:50]:
+            # Hybrid Score
-            # Holen der Metadaten für die Anzeige
+            hybrid_score = (sem_score * ALPHA) + (lex_score * BETA)
-            res = cursor.execute(
+            
            # Bonus: Wenn beides hoch ist (Semantik UND Keyword)
            if sem_score > 0.4 and lex_score > 0.6:
                hybrid_score += 0.1
            final_scores[doc_id] = hybrid_score
        # --- PHASE 4: SORTIEREN & AUSGEBEN ---
        sorted_ids = sorted(final_scores.keys(), key=lambda x: final_scores[x], reverse=True)
        results = []
        for doc_id in sorted_ids[:50]: # Top 50 Ergebnisse
            row = cursor.execute(
                "SELECT filename, path, snippet(documents, 2, '<b>', '</b>', '...', 15) FROM documents WHERE rowid = ?", 
                (doc_id,)
            ).fetchone()
-            
+            if row:
-            if res:
+                results.append(row)
                final_results.append(res)
        conn.close()
-        return final_results
+        return results
-# --- 2. INDEXER (Unverändert) ---
+# --- 2. INDEXER (Mit ZIP Support & Recursion) ---
 class IndexerThread(QThread):
    progress_signal = pyqtSignal(str)
@@ -211,33 +208,62 @@ class IndexerThread(QThread):
    def stop(self):
        """Request cancellation of the indexing run.

        Clears the ``is_running`` flag, which ``run`` checks before each
        directory and each file and stops early once it is false.
        """
        self.is_running = False
-    def _extract_text(self, filepath):
+    def _extract_text_from_stream(self, file_stream, filename):
-        ext = os.path.splitext(filepath)[1].lower()
+        """
        Liest Text aus einem Dateiobjekt (Stream) oder Pfad, basierend auf der Endung.
        Robuster gegen defekte PDF-Seiten.
        """
        ext = os.path.splitext(filename)[1].lower()
        text = ""
        try:
            if ext == ".pdf":
-                with pdfplumber.open(filepath) as pdf:
+                # pdfplumber kann direkt Dateiobjekte (BytesIO) lesen
-                    text = ""
+                try:
                    with pdfplumber.open(file_stream) as pdf:
                        for page in pdf.pages:
                            try:
                                # Versuch, Text von der einzelnen Seite zu holen
                                if page_text := page.extract_text():
                                    text += page_text + "\n"
-                    return text
+                            except Exception as e:
                                # Wenn eine Seite defekt ist (z.B. FontBBox Fehler), überspringen wir nur diese Seite
                                print(f"Warnung: Konnte eine Seite in '{filename}' nicht lesen (übersprungen). Fehler: {e}")
                                continue
                except Exception as e:
                    # Wenn die ganze PDF nicht geöffnet werden kann
                    print(f"Warnung: PDF '{filename}' konnte nicht geöffnet werden. Fehler: {e}")
                    return None
            elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
-                with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+                # Wir lesen die Bytes und decodieren sie
-                    return f.read()
+                if hasattr(file_stream, 'read'):
-            return None
+                    content_bytes = file_stream.read()
-        except:
+                    if isinstance(content_bytes, str): 
                        # Fallback
                        with open(file_stream, 'r', encoding='utf-8', errors='ignore') as f:
                            text = f.read()
                    else:
                        text = content_bytes.decode('utf-8', errors='ignore')
                else:
                    # Echter Dateipfad
                    with open(file_stream, "r", encoding="utf-8", errors="ignore") as f:
                        text = f.read()
        except Exception as e:
            # Allgemeiner Fehler beim Lesen
            # print(f"Lese-Fehler bei {filename}: {e}")
            return None
        return text
    def run(self):
        conn = sqlite3.connect(self.db_name)
        cursor = conn.cursor()
-        # Finde alle doc_ids, die zu dem Ordner gehören, um sie später zu löschen
+        # Bereinigen alter Einträge
        cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
        ids_to_delete = [row[0] for row in cursor.fetchall()]
        if ids_to_delete:
            # Lösche alte Einträge aus 'documents' und 'embeddings'
            cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids_to_delete))})", ids_to_delete)
            conn.commit()
@@ -246,43 +272,73 @@ class IndexerThread(QThread):
        skipped = 0
        was_cancelled = False
        # --- REKURSIVES DURCHSUCHEN ---
        for root, dirs, files in os.walk(self.folder_path):
            if not self.is_running:
                was_cancelled = True
                break
            for file in files:
                if not self.is_running:
                    was_cancelled = True
                    break
-                self.progress_signal.emit(f"Lese: {file}...")
+                file_path = os.path.join(root, file)
-                path = os.path.join(root, file)
+                self.progress_signal.emit(f"Prüfe: {file}...")
                content = self._extract_text(path)
-                if content and len(content.strip()) > 0:
+                # A. ZIP-DATEIEN BEHANDELN
-                    # 1. In FTS-Tabelle einfügen
+                if file.lower().endswith('.zip'):
-                    cursor.execute(
+                    try:
-                        "INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)", 
+                        with zipfile.ZipFile(file_path, 'r') as z:
-                        (file, path, content)
+                            for z_info in z.infolist():
-                    )
+                                if z_info.is_dir(): continue
                    doc_id = cursor.lastrowid
-                    # 2. Embedding erstellen und in BLOB umwandeln
+                                # Virtueller Pfad: C:\Ordner\Archiv.zip :: innen/datei.txt
-                    embedding = self.model.encode(content[:8192], convert_to_tensor=False)
+                                virtual_path = f"{file_path} :: {z_info.filename}"
                    embedding_blob = embedding.tobytes()
-                    # 3. Embedding in Tabelle einfügen
+                                with z.open(z_info) as z_file:
-                    cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (doc_id, embedding_blob))
+                                    # Inhalt in RAM laden (BytesIO)
                                    file_in_memory = io.BytesIO(z_file.read())
                                    content = self._extract_text_from_stream(file_in_memory, z_info.filename)
                                    if content and len(content.strip()) > 20:
                                        self._save_to_db(cursor, z_info.filename, virtual_path, content)
                                        indexed += 1
                    except Exception as e:
                        print(f"Zip Error {file}: {e}")
                        skipped += 1
                # B. NORMALE DATEIEN
                else:
                    content = self._extract_text_from_stream(file_path, file)
                    if content and len(content.strip()) > 20:
                        self._save_to_db(cursor, file, file_path, content)
                        indexed += 1
                    else:
                        skipped += 1
            if was_cancelled: break
        conn.commit()
        conn.close()
        self.finished_signal.emit(indexed, skipped, was_cancelled)
    def _save_to_db(self, cursor, filename, path, content):
        # 1. Text speichern
        cursor.execute(
            "INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)", 
            (filename, path, content)
        )
        doc_id = cursor.lastrowid
        # 2. Embedding erstellen (Max 8000 chars)
        embedding = self.model.encode(content[:8000], convert_to_tensor=False)
        embedding_blob = embedding.tobytes()
        # 3. Vektor speichern
        cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (doc_id, embedding_blob))
 # --- 3. UI (Unverändert) ---
 class UffWindow(QMainWindow):
@@ -294,14 +350,13 @@ class UffWindow(QMainWindow):
        self.load_saved_folders()
    def initUI(self):
-        self.setWindowTitle("UFF Text Search v4.0 (Semantic)")
+        self.setWindowTitle("UFF Text Search v5.0 (Hybrid Zip)")
        self.resize(1000, 700)
        central = QWidget()
        self.setCentralWidget(central)
        main_layout = QHBoxLayout(central)
        # ... (UI initialisation remains the same)
        # LINKS
        left_panel = QFrame()
        left_panel.setFixedWidth(250)
@@ -342,7 +397,7 @@ class UffWindow(QMainWindow):
        search_container = QHBoxLayout()
        self.input_search = QLineEdit()
-        self.input_search.setPlaceholderText("Suchbegriff... (Semantische Suche aktiv)")
+        self.input_search.setPlaceholderText("Suche... (Hybrid: Inhalt + Keywords)")
        self.input_search.returnPressed.connect(self.perform_search)
        self.input_search.setStyleSheet("padding: 8px; font-size: 14px;")
@@ -353,7 +408,7 @@ class UffWindow(QMainWindow):
        search_container.addWidget(self.input_search)
        search_container.addWidget(btn_go)
-        self.lbl_status = QLabel("Bereit. Semantisches Modell geladen.")
+        self.lbl_status = QLabel("Bereit. Hybrid-Modell geladen.")
        self.lbl_status.setStyleSheet("color: #666;")
        self.progress_bar = QProgressBar()
        self.progress_bar.hide()
@@ -375,8 +430,6 @@ class UffWindow(QMainWindow):
        main_layout.addWidget(splitter)
    # ... (Rest of UI Class)
    # LOGIK
    def load_saved_folders(self):
        self.folder_list.clear()
@@ -417,7 +470,6 @@ class UffWindow(QMainWindow):
        self.set_ui_busy(True)
        self.lbl_status.setText(f"Starte... {os.path.basename(folder)}")
        # Dem Thread jetzt das Modell mitgeben
        self.indexer_thread = IndexerThread(folder, db_name=self.db.db_name, model=self.db.model)
        self.indexer_thread.progress_signal.connect(lambda msg: self.lbl_status.setText(msg))
@@ -453,7 +505,9 @@ class UffWindow(QMainWindow):
        query = self.input_search.text()
        if not query: return
-        # Suche ausführen (jetzt mit Fuzzy!)
+        self.lbl_status.setText("Suche läuft...")
        QApplication.processEvents()
        results = self.db.search(query)
        self.lbl_status.setText(f"{len(results)} relevante Treffer.")
@@ -462,14 +516,24 @@ class UffWindow(QMainWindow):
            html = "<h3 style='color: gray; text-align: center; margin-top: 20px;'>Nichts gefunden.</h3>"
        for filename, filepath, snippet in results:
-            file_url = QUrl.fromLocalFile(filepath).toString()
+            # Falls es eine Datei im Zip ist, müssen wir den Link anpassen,
            # damit er zumindest das Zip öffnet.
            if " :: " in filepath:
                real_path = filepath.split(" :: ")[0]
                display_path = filepath # Zeige den virtuellen Pfad
            else:
                real_path = filepath
                display_path = filepath
            file_url = QUrl.fromLocalFile(real_path).toString()
            html += f"""
            <div style='margin-bottom: 10px; padding: 10px; background-color: #f9f9f9; border-left: 4px solid #2980b9;'>
                <a href="{file_url}" style='font-size: 16px; font-weight: bold; color: #2980b9; text-decoration: none;'>
                    {filename}
                </a>
                <div style='color: #333; margin-top: 5px; font-family: sans-serif; font-size: 13px;'>{snippet}</div>
-                <div style='color: #999; font-size: 11px; margin-top: 4px;'>{filepath}</div>
+                <div style='color: #999; font-size: 11px; margin-top: 4px;'>{display_path}</div>
            </div>
            """
        self.result_browser.setHtml(html)