add semantic search
This commit is contained in:
@@ -2,3 +2,7 @@ pdfplumber
|
|||||||
pdfminer.six
|
pdfminer.six
|
||||||
rapidfuzz
|
rapidfuzz
|
||||||
PyQt6
|
PyQt6
|
||||||
|
sentence-transformers==2.2.2
|
||||||
|
transformers==4.28.1
|
||||||
|
torch==1.13.1
|
||||||
|
numpy==1.24.2
|
||||||
|
|||||||
182
uff_app.py
182
uff_app.py
@@ -2,6 +2,8 @@ import sys
|
|||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
import numpy as np
|
||||||
|
from sentence_transformers import SentenceTransformer, util
|
||||||
|
|
||||||
# NEU: Für die Fuzzy-Logik
|
# NEU: Für die Fuzzy-Logik
|
||||||
from rapidfuzz import process, fuzz
|
from rapidfuzz import process, fuzz
|
||||||
@@ -13,12 +15,11 @@ from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
|
|||||||
from PyQt6.QtCore import Qt, QThread, pyqtSignal, QUrl
|
from PyQt6.QtCore import Qt, QThread, pyqtSignal, QUrl
|
||||||
from PyQt6.QtGui import QDesktopServices
|
from PyQt6.QtGui import QDesktopServices
|
||||||
|
|
||||||
# --- 1. DATENBANK MANAGER (Mit Fuzzy-Ranking) ---
|
# --- 1. DATENBANK MANAGER (Mit Semantischer Suche) ---
|
||||||
|
|
||||||
class DatabaseHandler:
|
class DatabaseHandler:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
# 1. Wir ermitteln den korrekten AppData Ordner für den User
|
# ... (same as before)
|
||||||
# Windows: C:\Users\Name\AppData\Local\UFF_Search
|
|
||||||
if os.name == 'nt':
|
if os.name == 'nt':
|
||||||
base_dir = os.getenv('LOCALAPPDATA')
|
base_dir = os.getenv('LOCALAPPDATA')
|
||||||
else:
|
else:
|
||||||
@@ -38,21 +39,36 @@ class DatabaseHandler:
|
|||||||
# Debug-Info (falls du es im Terminal testest)
|
# Debug-Info (falls du es im Terminal testest)
|
||||||
print(f"Datenbank Pfad: {self.db_name}")
|
print(f"Datenbank Pfad: {self.db_name}")
|
||||||
|
|
||||||
|
# 4. Semantisches Modell laden
|
||||||
|
# Wir geben dem User Feedback, weil das dauern kann
|
||||||
|
print("Lade das semantische Modell (all-MiniLM-L6-v2)...")
|
||||||
|
self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
||||||
|
print("Modell geladen.")
|
||||||
|
|
||||||
self.init_db()
|
self.init_db()
|
||||||
|
|
||||||
def init_db(self):
|
def init_db(self):
|
||||||
conn = sqlite3.connect(self.db_name)
|
conn = sqlite3.connect(self.db_name)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
# FTS-Tabelle für die Stichwortsuche
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE VIRTUAL TABLE IF NOT EXISTS documents
|
CREATE VIRTUAL TABLE IF NOT EXISTS documents
|
||||||
USING fts5(filename, path, content);
|
USING fts5(filename, path, content);
|
||||||
""")
|
""")
|
||||||
|
# Tabelle für die Ordner
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS folders (
|
CREATE TABLE IF NOT EXISTS folders (
|
||||||
path TEXT PRIMARY KEY,
|
path TEXT PRIMARY KEY,
|
||||||
alias TEXT
|
alias TEXT
|
||||||
);
|
);
|
||||||
""")
|
""")
|
||||||
|
# NEU: Tabelle für die Vektor-Embeddings
|
||||||
|
cursor.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings (
|
||||||
|
doc_id INTEGER PRIMARY KEY,
|
||||||
|
vec BLOB
|
||||||
|
);
|
||||||
|
""")
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
@@ -69,8 +85,18 @@ class DatabaseHandler:
|
|||||||
|
|
||||||
def remove_folder(self, path):
|
def remove_folder(self, path):
|
||||||
conn = sqlite3.connect(self.db_name)
|
conn = sqlite3.connect(self.db_name)
|
||||||
conn.execute("DELETE FROM folders WHERE path = ?", (path,))
|
cursor = conn.cursor()
|
||||||
conn.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",))
|
# Finde alle doc_ids, die zu dem Ordner gehören
|
||||||
|
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{path}%",))
|
||||||
|
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
if ids_to_delete:
|
||||||
|
# Lösche Einträge aus 'documents' und 'embeddings'
|
||||||
|
cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",))
|
||||||
|
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids_to_delete))})", ids_to_delete)
|
||||||
|
|
||||||
|
# Lösche den Ordner-Eintrag selbst
|
||||||
|
cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
@@ -83,70 +109,91 @@ class DatabaseHandler:
|
|||||||
def search(self, query):
|
def search(self, query):
|
||||||
if not query.strip(): return []
|
if not query.strip(): return []
|
||||||
|
|
||||||
conn = sqlite3.connect(self.db_name)
|
# --- PHASE 1: SEMANTISCHE SUCHE ---
|
||||||
|
query_embedding = self.model.encode(query, convert_to_tensor=False)
|
||||||
|
|
||||||
# 1. Versuch: Strikte Datenbank-Suche (Schnell)
|
conn = sqlite3.connect(self.db_name)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
cursor.execute("SELECT doc_id, vec FROM embeddings")
|
||||||
|
all_embeddings_data = cursor.fetchall()
|
||||||
|
|
||||||
|
doc_ids = [item[0] for item in all_embeddings_data]
|
||||||
|
|
||||||
|
# Konvertiere BLOBs zurück zu Vektoren
|
||||||
|
all_embeddings = np.array([np.frombuffer(item[1], dtype=np.float32) for item in all_embeddings_data])
|
||||||
|
|
||||||
|
# Cosine Similarity berechnen
|
||||||
|
semantic_scores = {}
|
||||||
|
if len(all_embeddings) > 0:
|
||||||
|
cos_scores = util.cos_sim(query_embedding, all_embeddings)[0].numpy()
|
||||||
|
|
||||||
|
for i, score in enumerate(cos_scores):
|
||||||
|
# Nur relevante Ergebnisse (>35% Ähnlichkeit) berücksichtigen
|
||||||
|
if score > 0.35:
|
||||||
|
# Wir gewichten die semantische Suche hoch (z.B. max 100 Pkt)
|
||||||
|
semantic_scores[doc_ids[i]] = float(score) * 100
|
||||||
|
|
||||||
|
# --- PHASE 2: STICHWORTSUCHE (FTS) ---
|
||||||
words = query.replace('"', '').split()
|
words = query.replace('"', '').split()
|
||||||
# Wir suchen nach "Wort*" -> findet Wortanfänge
|
|
||||||
sql_query_parts = [f'"{w}"*' for w in words]
|
sql_query_parts = [f'"{w}"*' for w in words]
|
||||||
sql_query_string = " OR ".join(sql_query_parts)
|
sql_query_string = " OR ".join(sql_query_parts)
|
||||||
|
|
||||||
sql = """
|
sql = """
|
||||||
SELECT filename, path, snippet(documents, 2, '<b>', '</b>', '...', 15), content
|
SELECT rowid, filename, path, content
|
||||||
FROM documents
|
FROM documents
|
||||||
WHERE documents MATCH ?
|
WHERE documents MATCH ?
|
||||||
LIMIT 200
|
LIMIT 200
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rows = conn.execute(sql, (sql_query_string,)).fetchall()
|
fts_rows = cursor.execute(sql, (sql_query_string,)).fetchall()
|
||||||
except:
|
except:
|
||||||
rows = []
|
fts_rows = []
|
||||||
|
|
||||||
# 2. Versuch (FALLBACK): Wenn DB nichts findet, laden wir ALLES
|
# --- PHASE 3: KOMBINATION & BEWERTUNG ---
|
||||||
# Das ist der "Panic Mode" für starke Tippfehler (wie "vertraaag")
|
combined_scores = {}
|
||||||
if len(rows) < 5:
|
|
||||||
# Wir holen einfach mal die ersten 1000 Dokumente ohne Filter
|
|
||||||
fallback_sql = """
|
|
||||||
SELECT filename, path, snippet(documents, 2, '<b>', '</b>', '...', 15), content
|
|
||||||
FROM documents
|
|
||||||
LIMIT 1000
|
|
||||||
"""
|
|
||||||
rows = conn.execute(fallback_sql).fetchall()
|
|
||||||
|
|
||||||
conn.close()
|
# Scores aus der semantischen Suche übernehmen
|
||||||
|
for doc_id, score in semantic_scores.items():
|
||||||
|
combined_scores[doc_id] = score
|
||||||
|
|
||||||
# 3. Python Fuzzy Re-Ranking (RapidFuzz)
|
# Scores aus der FTS-Suche hinzufügen/kombinieren
|
||||||
scored_results = []
|
for doc_id, filename, path, content in fts_rows:
|
||||||
|
# Fuzzy-Score für Relevanz
|
||||||
for filename, path, snippet, content in rows:
|
|
||||||
# Wir berechnen Scores mit besserer Gewichtung
|
|
||||||
score_name = fuzz.WRatio(query.lower(), filename.lower())
|
score_name = fuzz.WRatio(query.lower(), filename.lower())
|
||||||
|
|
||||||
# Content-Check: Wir nehmen Content (falls snippet zu kurz ist)
|
|
||||||
# Begrenzung auf die ersten 5000 Zeichen für Performance
|
|
||||||
check_content = content[:5000] if content else ""
|
check_content = content[:5000] if content else ""
|
||||||
score_content = fuzz.partial_token_set_ratio(query.lower(), check_content.lower())
|
score_content = fuzz.partial_token_set_ratio(query.lower(), check_content.lower())
|
||||||
|
fuzzy_score = (score_name * 0.2) + (score_content * 0.8)
|
||||||
|
|
||||||
# Gewichteter Durchschnitt: Inhalt ist wichtiger als Dateiname
|
# Bonus für exakte Wort-Treffer
|
||||||
final_score = (score_name * 0.2) + (score_content * 0.8)
|
|
||||||
|
|
||||||
# Bonus für exakte Wort-Treffer (jetzt stärker)
|
|
||||||
if all(w.lower() in (filename + check_content).lower() for w in words):
|
if all(w.lower() in (filename + check_content).lower() for w in words):
|
||||||
final_score += 20
|
fuzzy_score += 20
|
||||||
|
|
||||||
# Filter: Nur anzeigen, wenn Score halbwegs okay ist
|
# Wenn das Dokument bereits durch die semantische Suche gefunden wurde,
|
||||||
# Bei "vertraaag" vs "vertrag" ist der Score meist > 70
|
# geben wir einen massiven Bonus. Ansonsten normaler Score.
|
||||||
if final_score > 55:
|
if doc_id in combined_scores:
|
||||||
scored_results.append({
|
combined_scores[doc_id] += fuzzy_score + 50 # Bonus!
|
||||||
"score": final_score,
|
else:
|
||||||
"data": (filename, path, snippet)
|
combined_scores[doc_id] = fuzzy_score
|
||||||
})
|
|
||||||
|
|
||||||
# 4. Sortieren
|
# --- PHASE 4: SORTIEREN & ERGEBNISSE HOLEN ---
|
||||||
scored_results.sort(key=lambda x: x["score"], reverse=True)
|
# Sortiere die doc_ids nach dem höchsten Score
|
||||||
|
sorted_doc_ids = sorted(combined_scores.keys(), key=lambda doc_id: combined_scores[doc_id], reverse=True)
|
||||||
|
|
||||||
return [item["data"] for item in scored_results[:50]]
|
# Top 50 Ergebnisse
|
||||||
|
final_results = []
|
||||||
|
for doc_id in sorted_doc_ids[:50]:
|
||||||
|
# Holen der Metadaten für die Anzeige
|
||||||
|
res = cursor.execute(
|
||||||
|
"SELECT filename, path, snippet(documents, 2, '<b>', '</b>', '...', 15) FROM documents WHERE rowid = ?",
|
||||||
|
(doc_id,)
|
||||||
|
).fetchone()
|
||||||
|
|
||||||
|
if res:
|
||||||
|
final_results.append(res)
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
return final_results
|
||||||
|
|
||||||
# --- 2. INDEXER (Unverändert) ---
|
# --- 2. INDEXER (Unverändert) ---
|
||||||
|
|
||||||
@@ -154,10 +201,11 @@ class IndexerThread(QThread):
|
|||||||
progress_signal = pyqtSignal(str)
|
progress_signal = pyqtSignal(str)
|
||||||
finished_signal = pyqtSignal(int, int, bool)
|
finished_signal = pyqtSignal(int, int, bool)
|
||||||
|
|
||||||
def __init__(self, folder_path, db_name="uff_index.db"):
|
def __init__(self, folder_path, db_name, model):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.folder_path = folder_path
|
self.folder_path = folder_path
|
||||||
self.db_name = db_name
|
self.db_name = db_name
|
||||||
|
self.model = model
|
||||||
self.is_running = True
|
self.is_running = True
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
@@ -182,8 +230,17 @@ class IndexerThread(QThread):
|
|||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
conn = sqlite3.connect(self.db_name)
|
conn = sqlite3.connect(self.db_name)
|
||||||
conn.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
cursor = conn.cursor()
|
||||||
conn.commit()
|
|
||||||
|
# Finde alle doc_ids, die zu dem Ordner gehören, um sie später zu löschen
|
||||||
|
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||||
|
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
if ids_to_delete:
|
||||||
|
# Lösche alte Einträge aus 'documents' und 'embeddings'
|
||||||
|
cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||||
|
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids_to_delete))})", ids_to_delete)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
indexed = 0
|
indexed = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
@@ -203,10 +260,20 @@ class IndexerThread(QThread):
|
|||||||
content = self._extract_text(path)
|
content = self._extract_text(path)
|
||||||
|
|
||||||
if content and len(content.strip()) > 0:
|
if content and len(content.strip()) > 0:
|
||||||
conn.execute(
|
# 1. In FTS-Tabelle einfügen
|
||||||
|
cursor.execute(
|
||||||
"INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)",
|
"INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)",
|
||||||
(file, path, content)
|
(file, path, content)
|
||||||
)
|
)
|
||||||
|
doc_id = cursor.lastrowid
|
||||||
|
|
||||||
|
# 2. Embedding erstellen und in BLOB umwandeln
|
||||||
|
embedding = self.model.encode(content[:8192], convert_to_tensor=False)
|
||||||
|
embedding_blob = embedding.tobytes()
|
||||||
|
|
||||||
|
# 3. Embedding in Tabelle einfügen
|
||||||
|
cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (doc_id, embedding_blob))
|
||||||
|
|
||||||
indexed += 1
|
indexed += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
@@ -227,13 +294,14 @@ class UffWindow(QMainWindow):
|
|||||||
self.load_saved_folders()
|
self.load_saved_folders()
|
||||||
|
|
||||||
def initUI(self):
|
def initUI(self):
|
||||||
self.setWindowTitle("UFF Text Search v3.0 (Fuzzy)")
|
self.setWindowTitle("UFF Text Search v4.0 (Semantic)")
|
||||||
self.resize(1000, 700)
|
self.resize(1000, 700)
|
||||||
|
|
||||||
central = QWidget()
|
central = QWidget()
|
||||||
self.setCentralWidget(central)
|
self.setCentralWidget(central)
|
||||||
main_layout = QHBoxLayout(central)
|
main_layout = QHBoxLayout(central)
|
||||||
|
|
||||||
|
# ... (UI initialisation remains the same)
|
||||||
# LINKS
|
# LINKS
|
||||||
left_panel = QFrame()
|
left_panel = QFrame()
|
||||||
left_panel.setFixedWidth(250)
|
left_panel.setFixedWidth(250)
|
||||||
@@ -274,7 +342,7 @@ class UffWindow(QMainWindow):
|
|||||||
|
|
||||||
search_container = QHBoxLayout()
|
search_container = QHBoxLayout()
|
||||||
self.input_search = QLineEdit()
|
self.input_search = QLineEdit()
|
||||||
self.input_search.setPlaceholderText("Suchbegriff... (Fuzzy aktiv)")
|
self.input_search.setPlaceholderText("Suchbegriff... (Semantische Suche aktiv)")
|
||||||
self.input_search.returnPressed.connect(self.perform_search)
|
self.input_search.returnPressed.connect(self.perform_search)
|
||||||
self.input_search.setStyleSheet("padding: 8px; font-size: 14px;")
|
self.input_search.setStyleSheet("padding: 8px; font-size: 14px;")
|
||||||
|
|
||||||
@@ -285,7 +353,7 @@ class UffWindow(QMainWindow):
|
|||||||
search_container.addWidget(self.input_search)
|
search_container.addWidget(self.input_search)
|
||||||
search_container.addWidget(btn_go)
|
search_container.addWidget(btn_go)
|
||||||
|
|
||||||
self.lbl_status = QLabel("Bereit.")
|
self.lbl_status = QLabel("Bereit. Semantisches Modell geladen.")
|
||||||
self.lbl_status.setStyleSheet("color: #666;")
|
self.lbl_status.setStyleSheet("color: #666;")
|
||||||
self.progress_bar = QProgressBar()
|
self.progress_bar = QProgressBar()
|
||||||
self.progress_bar.hide()
|
self.progress_bar.hide()
|
||||||
@@ -307,6 +375,8 @@ class UffWindow(QMainWindow):
|
|||||||
|
|
||||||
main_layout.addWidget(splitter)
|
main_layout.addWidget(splitter)
|
||||||
|
|
||||||
|
# ... (Rest of UI Class)
|
||||||
|
|
||||||
# LOGIK
|
# LOGIK
|
||||||
def load_saved_folders(self):
|
def load_saved_folders(self):
|
||||||
self.folder_list.clear()
|
self.folder_list.clear()
|
||||||
@@ -347,10 +417,8 @@ class UffWindow(QMainWindow):
|
|||||||
self.set_ui_busy(True)
|
self.set_ui_busy(True)
|
||||||
self.lbl_status.setText(f"Starte... {os.path.basename(folder)}")
|
self.lbl_status.setText(f"Starte... {os.path.basename(folder)}")
|
||||||
|
|
||||||
# HIER WAR DER FEHLER:
|
# Dem Thread jetzt das Modell mitgeben
|
||||||
# Wir müssen dem Thread explizit sagen, wo die Datenbank liegt!
|
self.indexer_thread = IndexerThread(folder, db_name=self.db.db_name, model=self.db.model)
|
||||||
# self.db.db_name enthält den korrekten Pfad (C:\Users\...\AppData\...)
|
|
||||||
self.indexer_thread = IndexerThread(folder, db_name=self.db.db_name)
|
|
||||||
|
|
||||||
self.indexer_thread.progress_signal.connect(lambda msg: self.lbl_status.setText(msg))
|
self.indexer_thread.progress_signal.connect(lambda msg: self.lbl_status.setText(msg))
|
||||||
self.indexer_thread.finished_signal.connect(self.indexing_finished)
|
self.indexer_thread.finished_signal.connect(self.indexing_finished)
|
||||||
|
|||||||
Reference in New Issue
Block a user