Refactor logging and error handling; implement model loading in a separate thread for improved UI responsiveness
This commit is contained in:
280
uff_app.py
280
uff_app.py
@@ -5,16 +5,20 @@ import pdfplumber
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import zipfile
|
import zipfile
|
||||||
import io
|
import io
|
||||||
from sentence_transformers import SentenceTransformer, util
|
import traceback
|
||||||
|
|
||||||
|
from sentence_transformers import SentenceTransformer, util
|
||||||
from rapidfuzz import process, fuzz
|
from rapidfuzz import process, fuzz
|
||||||
|
|
||||||
|
# Wichtige Importe für UI und Signale
|
||||||
|
from PyQt6.QtCore import qInstallMessageHandler, QtMsgType, Qt, QThread, pyqtSignal, QUrl
|
||||||
from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
|
from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
|
||||||
QHBoxLayout, QLineEdit, QPushButton, QLabel,
|
QHBoxLayout, QLineEdit, QPushButton, QLabel,
|
||||||
QFileDialog, QTextBrowser, QProgressBar, QMessageBox,
|
QFileDialog, QTextBrowser, QProgressBar, QMessageBox,
|
||||||
QListWidget, QListWidgetItem, QSplitter, QFrame)
|
QListWidget, QListWidgetItem, QSplitter, QFrame, QSplashScreen)
|
||||||
from PyQt6.QtCore import Qt, QThread, pyqtSignal, QUrl
|
from PyQt6.QtGui import QDesktopServices, QPixmap
|
||||||
from PyQt6.QtGui import QDesktopServices
|
|
||||||
|
# --- 0. LOGGING & SYSTEM-SETUP ---
|
||||||
|
|
||||||
if os.name == 'nt':
|
if os.name == 'nt':
|
||||||
base_dir = os.getenv('LOCALAPPDATA')
|
base_dir = os.getenv('LOCALAPPDATA')
|
||||||
@@ -27,14 +31,14 @@ if not os.path.exists(log_dir):
|
|||||||
|
|
||||||
log_file_path = os.path.join(log_dir, "uff.log")
|
log_file_path = os.path.join(log_dir, "uff.log")
|
||||||
|
|
||||||
# Logger-Klasse, die alles in die Datei schreibt
|
# Logger-Klasse
|
||||||
class Logger(object):
|
class Logger(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.log = open(log_file_path, "w", encoding="utf-8") # "w" überschreibt bei jedem Neustart
|
self.log = open(log_file_path, "w", encoding="utf-8")
|
||||||
|
|
||||||
def write(self, message):
|
def write(self, message):
|
||||||
self.log.write(message)
|
self.log.write(message)
|
||||||
self.log.flush() # Sofort schreiben, damit nichts verloren geht
|
self.log.flush()
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.log.flush()
|
self.log.flush()
|
||||||
@@ -46,11 +50,52 @@ sys.stderr = sys.stdout
|
|||||||
print(f"--- START LOGGING ---")
|
print(f"--- START LOGGING ---")
|
||||||
print(f"Logfile liegt hier: {log_file_path}")
|
print(f"Logfile liegt hier: {log_file_path}")
|
||||||
|
|
||||||
# Font-Warnungen unterdrücken
|
# --- QT MESSAGE HANDLER (Der Filter für C++ Errors) ---
|
||||||
os.environ["QT_LOGGING_RULES"] = "qt.qpa.fonts.warning=false;qt.text.fonts.db.warning=false"
|
def qt_message_handler(mode, context, message):
|
||||||
|
"""
|
||||||
|
Fängt interne Qt-Nachrichten ab und filtert Font-Fehler heraus.
|
||||||
|
"""
|
||||||
|
msg_lower = message.lower()
|
||||||
|
|
||||||
|
# FILTER-LISTE: Erweitert basierend auf deinen Logs
|
||||||
|
ignore_keywords = [
|
||||||
|
"qt.text.font",
|
||||||
|
"qt.qpa.fonts",
|
||||||
|
"opentype support missing",
|
||||||
|
"directwrite",
|
||||||
|
"unable to create font",
|
||||||
|
"fontbbox",
|
||||||
|
"script 66",
|
||||||
|
"script 9",
|
||||||
|
"script 10",
|
||||||
|
"script 20",
|
||||||
|
"script 32"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Wenn eines der Keywords vorkommt -> Nachricht ignorieren (return)
|
||||||
|
if any(keyword in msg_lower for keyword in ignore_keywords):
|
||||||
|
return
|
||||||
|
|
||||||
|
# Formatierung für das Logfile
|
||||||
|
mode_str = "INFO"
|
||||||
|
if mode == QtMsgType.QtWarningMsg: mode_str = "WARNING"
|
||||||
|
elif mode == QtMsgType.QtCriticalMsg: mode_str = "CRITICAL"
|
||||||
|
elif mode == QtMsgType.QtFatalMsg: mode_str = "FATAL"
|
||||||
|
|
||||||
|
# Nur relevante Nachrichten ins Log schreiben
|
||||||
|
try:
|
||||||
|
sys.stdout.write(f"[Qt {mode_str}] {message}\n")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Handler installieren (Muss VOR der App-Erstellung passieren)
|
||||||
|
qInstallMessageHandler(qt_message_handler)
|
||||||
|
|
||||||
|
# Zusätzlich Environment Variable setzen
|
||||||
|
os.environ["QT_LOGGING_RULES"] = "qt.text.font.db=false;qt.qpa.fonts=false"
|
||||||
|
|
||||||
|
|
||||||
# --- 1. DATENBANK MANAGER (Mit Hybrid Search Scoring) ---
|
# --- 1. DATENBANK MANAGER ---
|
||||||
|
|
||||||
class DatabaseHandler:
|
class DatabaseHandler:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -65,31 +110,24 @@ class DatabaseHandler:
|
|||||||
os.makedirs(self.app_data_dir)
|
os.makedirs(self.app_data_dir)
|
||||||
|
|
||||||
self.db_name = os.path.join(self.app_data_dir, "uff_index.db")
|
self.db_name = os.path.join(self.app_data_dir, "uff_index.db")
|
||||||
|
|
||||||
print(f"Datenbank Pfad: {self.db_name}")
|
print(f"Datenbank Pfad: {self.db_name}")
|
||||||
|
self.model = None
|
||||||
print("Lade das semantische Modell (all-MiniLM-L6-v2)...")
|
|
||||||
self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
||||||
print("Modell geladen.")
|
|
||||||
|
|
||||||
self.init_db()
|
self.init_db()
|
||||||
|
|
||||||
def init_db(self):
|
def init_db(self):
|
||||||
conn = sqlite3.connect(self.db_name)
|
conn = sqlite3.connect(self.db_name)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
# FTS-Tabelle für die Stichwortsuche
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE VIRTUAL TABLE IF NOT EXISTS documents
|
CREATE VIRTUAL TABLE IF NOT EXISTS documents
|
||||||
USING fts5(filename, path, content);
|
USING fts5(filename, path, content);
|
||||||
""")
|
""")
|
||||||
# Tabelle für die Ordner
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS folders (
|
CREATE TABLE IF NOT EXISTS folders (
|
||||||
path TEXT PRIMARY KEY,
|
path TEXT PRIMARY KEY,
|
||||||
alias TEXT
|
alias TEXT
|
||||||
);
|
);
|
||||||
""")
|
""")
|
||||||
# Tabelle für die Vektor-Embeddings
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS embeddings (
|
CREATE TABLE IF NOT EXISTS embeddings (
|
||||||
doc_id INTEGER PRIMARY KEY,
|
doc_id INTEGER PRIMARY KEY,
|
||||||
@@ -115,11 +153,9 @@ class DatabaseHandler:
|
|||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{path}%",))
|
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{path}%",))
|
||||||
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
if ids_to_delete:
|
if ids_to_delete:
|
||||||
cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",))
|
cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",))
|
||||||
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids_to_delete))})", ids_to_delete)
|
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids_to_delete))})", ids_to_delete)
|
||||||
|
|
||||||
cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
|
cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
@@ -131,43 +167,32 @@ class DatabaseHandler:
|
|||||||
return [r[0] for r in rows]
|
return [r[0] for r in rows]
|
||||||
|
|
||||||
def search(self, query):
|
def search(self, query):
|
||||||
if not query.strip(): return []
|
if not query.strip() or not self.model: return []
|
||||||
|
|
||||||
# --- PHASE 1: SEMANTISCHE SUCHE (Vektor) ---
|
# PHASE 1: SEMANTIK
|
||||||
query_embedding = self.model.encode(query, convert_to_tensor=False)
|
query_embedding = self.model.encode(query, convert_to_tensor=False)
|
||||||
|
|
||||||
conn = sqlite3.connect(self.db_name)
|
conn = sqlite3.connect(self.db_name)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute("SELECT doc_id, vec FROM embeddings")
|
cursor.execute("SELECT doc_id, vec FROM embeddings")
|
||||||
all_embeddings_data = cursor.fetchall()
|
all_embeddings_data = cursor.fetchall()
|
||||||
|
|
||||||
doc_ids = [item[0] for item in all_embeddings_data]
|
doc_ids = [item[0] for item in all_embeddings_data]
|
||||||
|
|
||||||
if not doc_ids:
|
if not doc_ids:
|
||||||
conn.close()
|
conn.close()
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# BLOBs zurück zu Vektoren
|
|
||||||
all_embeddings = np.array([np.frombuffer(item[1], dtype=np.float32) for item in all_embeddings_data])
|
all_embeddings = np.array([np.frombuffer(item[1], dtype=np.float32) for item in all_embeddings_data])
|
||||||
|
|
||||||
# Cosine Similarity (Werte zwischen -1 und 1)
|
|
||||||
# clip auf 0, da negative Werte hier irrelevant sind
|
|
||||||
cos_scores = util.cos_sim(query_embedding, all_embeddings)[0].numpy()
|
cos_scores = util.cos_sim(query_embedding, all_embeddings)[0].numpy()
|
||||||
cos_scores = np.clip(cos_scores, 0, 1)
|
cos_scores = np.clip(cos_scores, 0, 1)
|
||||||
|
|
||||||
# Map: doc_id -> Semantic Score (0.0 - 1.0)
|
|
||||||
semantic_map = {doc_id: float(score) for doc_id, score in zip(doc_ids, cos_scores)}
|
semantic_map = {doc_id: float(score) for doc_id, score in zip(doc_ids, cos_scores)}
|
||||||
|
|
||||||
# --- PHASE 2: STICHWORTSUCHE (FTS & Fuzzy) ---
|
# PHASE 2: LEXIKALISCH
|
||||||
words = query.replace('"', '').split()
|
words = query.replace('"', '').split()
|
||||||
if not words: words = [query]
|
if not words: words = [query]
|
||||||
|
|
||||||
sql_query_parts = [f'"{w}"*' for w in words]
|
sql_query_parts = [f'"{w}"*' for w in words]
|
||||||
sql_query_string = " OR ".join(sql_query_parts)
|
sql_query_string = " OR ".join(sql_query_parts)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Wir holen Kandidaten, die die Wörter enthalten
|
|
||||||
fts_rows = cursor.execute("""
|
fts_rows = cursor.execute("""
|
||||||
SELECT rowid, filename, content
|
SELECT rowid, filename, content
|
||||||
FROM documents
|
FROM documents
|
||||||
@@ -178,55 +203,53 @@ class DatabaseHandler:
|
|||||||
fts_rows = []
|
fts_rows = []
|
||||||
|
|
||||||
lexical_map = {}
|
lexical_map = {}
|
||||||
|
|
||||||
for doc_id, filename, content in fts_rows:
|
for doc_id, filename, content in fts_rows:
|
||||||
# Fuzzy-Score berechnen (0 bis 100) -> normalisieren auf 0.0 - 1.0
|
|
||||||
ratio_name = fuzz.partial_ratio(query.lower(), filename.lower())
|
ratio_name = fuzz.partial_ratio(query.lower(), filename.lower())
|
||||||
ratio_content = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower())
|
ratio_content = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower())
|
||||||
|
|
||||||
best_ratio = max(ratio_name, ratio_content)
|
best_ratio = max(ratio_name, ratio_content)
|
||||||
lexical_map[doc_id] = best_ratio / 100.0
|
lexical_map[doc_id] = best_ratio / 100.0
|
||||||
|
|
||||||
# --- PHASE 3: HYBRID FUSION (Kombination) ---
|
# PHASE 3: HYBRID
|
||||||
final_scores = {}
|
final_scores = {}
|
||||||
|
ALPHA = 0.65
|
||||||
# Gewichtung anpassen
|
BETA = 0.35
|
||||||
ALPHA = 0.65 # 65% Semantik
|
|
||||||
BETA = 0.35 # 35% Stichwort
|
|
||||||
|
|
||||||
for doc_id, sem_score in semantic_map.items():
|
for doc_id, sem_score in semantic_map.items():
|
||||||
# Filter: Nur Ergebnisse mit minimaler Relevanz betrachten
|
|
||||||
if sem_score < 0.15 and doc_id not in lexical_map:
|
if sem_score < 0.15 and doc_id not in lexical_map:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
lex_score = lexical_map.get(doc_id, 0.0)
|
lex_score = lexical_map.get(doc_id, 0.0)
|
||||||
|
|
||||||
# Hybrid Score
|
|
||||||
hybrid_score = (sem_score * ALPHA) + (lex_score * BETA)
|
hybrid_score = (sem_score * ALPHA) + (lex_score * BETA)
|
||||||
|
|
||||||
# Bonus: Wenn beides hoch ist (Semantik UND Keyword)
|
|
||||||
if sem_score > 0.4 and lex_score > 0.6:
|
if sem_score > 0.4 and lex_score > 0.6:
|
||||||
hybrid_score += 0.1
|
hybrid_score += 0.1
|
||||||
|
|
||||||
final_scores[doc_id] = hybrid_score
|
final_scores[doc_id] = hybrid_score
|
||||||
|
|
||||||
# --- PHASE 4: SORTIEREN & AUSGEBEN ---
|
# PHASE 4: SORT
|
||||||
sorted_ids = sorted(final_scores.keys(), key=lambda x: final_scores[x], reverse=True)
|
sorted_ids = sorted(final_scores.keys(), key=lambda x: final_scores[x], reverse=True)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for doc_id in sorted_ids[:50]: # Top 50 Ergebnisse
|
for doc_id in sorted_ids[:50]:
|
||||||
row = cursor.execute(
|
row = cursor.execute(
|
||||||
"SELECT filename, path, snippet(documents, 2, '<b>', '</b>', '...', 15) FROM documents WHERE rowid = ?",
|
"SELECT filename, path, snippet(documents, 2, '<b>', '</b>', '...', 15) FROM documents WHERE rowid = ?",
|
||||||
(doc_id,)
|
(doc_id,)
|
||||||
).fetchone()
|
).fetchone()
|
||||||
if row:
|
if row:
|
||||||
results.append(row)
|
results.append(row)
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# --- 2. INDEXER (Mit ZIP Support & Recursion) ---
|
# --- 2. MODEL LOADER ---
|
||||||
|
class ModelLoaderThread(QThread):
|
||||||
|
model_loaded = pyqtSignal(object)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
print("Lade das semantische Modell (all-MiniLM-L6-v2)...")
|
||||||
|
try:
|
||||||
|
model = SentenceTransformer('all-MiniLM-L6-v2')
|
||||||
|
print("Modell geladen.")
|
||||||
|
self.model_loaded.emit(model)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Fehler beim Laden des Modells: {e}")
|
||||||
|
self.model_loaded.emit(None)
|
||||||
|
|
||||||
|
# --- 3. INDEXER ---
|
||||||
class IndexerThread(QThread):
|
class IndexerThread(QThread):
|
||||||
progress_signal = pyqtSignal(str)
|
progress_signal = pyqtSignal(str)
|
||||||
finished_signal = pyqtSignal(int, int, bool)
|
finished_signal = pyqtSignal(int, int, bool)
|
||||||
@@ -242,58 +265,41 @@ class IndexerThread(QThread):
|
|||||||
self.is_running = False
|
self.is_running = False
|
||||||
|
|
||||||
def _extract_text_from_stream(self, file_stream, filename):
|
def _extract_text_from_stream(self, file_stream, filename):
|
||||||
"""
|
|
||||||
Liest Text aus einem Dateiobjekt (Stream) oder Pfad, basierend auf der Endung.
|
|
||||||
Robuster gegen defekte PDF-Seiten.
|
|
||||||
"""
|
|
||||||
ext = os.path.splitext(filename)[1].lower()
|
ext = os.path.splitext(filename)[1].lower()
|
||||||
text = ""
|
text = ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if ext == ".pdf":
|
if ext == ".pdf":
|
||||||
# pdfplumber kann direkt Dateiobjekte (BytesIO) lesen
|
|
||||||
try:
|
try:
|
||||||
with pdfplumber.open(file_stream) as pdf:
|
with pdfplumber.open(file_stream) as pdf:
|
||||||
for page in pdf.pages:
|
for page in pdf.pages:
|
||||||
try:
|
try:
|
||||||
# Versuch, Text von der einzelnen Seite zu holen
|
|
||||||
if page_text := page.extract_text():
|
if page_text := page.extract_text():
|
||||||
text += page_text + "\n"
|
text += page_text + "\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Wenn eine Seite defekt ist (z.B. FontBBox Fehler), überspringen wir nur diese Seite
|
print(f"Warnung: Konnte eine Seite in '{filename}' nicht lesen. Fehler: {e}")
|
||||||
print(f"Warnung: Konnte eine Seite in '{filename}' nicht lesen (übersprungen). Fehler: {e}")
|
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Wenn die ganze PDF nicht geöffnet werden kann
|
print(f"Warnung: PDF '{filename}' defekt. Fehler: {e}")
|
||||||
print(f"Warnung: PDF '{filename}' konnte nicht geöffnet werden. Fehler: {e}")
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
|
elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
|
||||||
# Wir lesen die Bytes und decodieren sie
|
|
||||||
if hasattr(file_stream, 'read'):
|
if hasattr(file_stream, 'read'):
|
||||||
content_bytes = file_stream.read()
|
content_bytes = file_stream.read()
|
||||||
if isinstance(content_bytes, str):
|
if isinstance(content_bytes, str):
|
||||||
# Fallback
|
|
||||||
with open(file_stream, 'r', encoding='utf-8', errors='ignore') as f:
|
with open(file_stream, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
else:
|
else:
|
||||||
text = content_bytes.decode('utf-8', errors='ignore')
|
text = content_bytes.decode('utf-8', errors='ignore')
|
||||||
else:
|
else:
|
||||||
# Echter Dateipfad
|
|
||||||
with open(file_stream, "r", encoding="utf-8", errors="ignore") as f:
|
with open(file_stream, "r", encoding="utf-8", errors="ignore") as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Allgemeiner Fehler beim Lesen
|
|
||||||
# print(f"Lese-Fehler bei {filename}: {e}")
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
conn = sqlite3.connect(self.db_name)
|
conn = sqlite3.connect(self.db_name)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
# Bereinigen alter Einträge
|
|
||||||
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||||
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
||||||
if ids_to_delete:
|
if ids_to_delete:
|
||||||
@@ -305,7 +311,6 @@ class IndexerThread(QThread):
|
|||||||
skipped = 0
|
skipped = 0
|
||||||
was_cancelled = False
|
was_cancelled = False
|
||||||
|
|
||||||
# --- REKURSIVES DURCHSUCHEN ---
|
|
||||||
for root, dirs, files in os.walk(self.folder_path):
|
for root, dirs, files in os.walk(self.folder_path):
|
||||||
if not self.is_running:
|
if not self.is_running:
|
||||||
was_cancelled = True
|
was_cancelled = True
|
||||||
@@ -319,30 +324,21 @@ class IndexerThread(QThread):
|
|||||||
file_path = os.path.join(root, file)
|
file_path = os.path.join(root, file)
|
||||||
self.progress_signal.emit(f"Prüfe: {file}...")
|
self.progress_signal.emit(f"Prüfe: {file}...")
|
||||||
|
|
||||||
# A. ZIP-DATEIEN BEHANDELN
|
|
||||||
if file.lower().endswith('.zip'):
|
if file.lower().endswith('.zip'):
|
||||||
try:
|
try:
|
||||||
with zipfile.ZipFile(file_path, 'r') as z:
|
with zipfile.ZipFile(file_path, 'r') as z:
|
||||||
for z_info in z.infolist():
|
for z_info in z.infolist():
|
||||||
if z_info.is_dir(): continue
|
if z_info.is_dir(): continue
|
||||||
|
|
||||||
# Virtueller Pfad: C:\Ordner\Archiv.zip :: innen/datei.txt
|
|
||||||
virtual_path = f"{file_path} :: {z_info.filename}"
|
virtual_path = f"{file_path} :: {z_info.filename}"
|
||||||
|
|
||||||
with z.open(z_info) as z_file:
|
with z.open(z_info) as z_file:
|
||||||
# Inhalt in RAM laden (BytesIO)
|
|
||||||
file_in_memory = io.BytesIO(z_file.read())
|
file_in_memory = io.BytesIO(z_file.read())
|
||||||
|
|
||||||
content = self._extract_text_from_stream(file_in_memory, z_info.filename)
|
content = self._extract_text_from_stream(file_in_memory, z_info.filename)
|
||||||
|
|
||||||
if content and len(content.strip()) > 20:
|
if content and len(content.strip()) > 20:
|
||||||
self._save_to_db(cursor, z_info.filename, virtual_path, content)
|
self._save_to_db(cursor, z_info.filename, virtual_path, content)
|
||||||
indexed += 1
|
indexed += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Zip Error {file}: {e}")
|
print(f"Zip Error {file}: {e}")
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
||||||
# B. NORMALE DATEIEN
|
|
||||||
else:
|
else:
|
||||||
content = self._extract_text_from_stream(file_path, file)
|
content = self._extract_text_from_stream(file_path, file)
|
||||||
if content and len(content.strip()) > 20:
|
if content and len(content.strip()) > 20:
|
||||||
@@ -358,32 +354,26 @@ class IndexerThread(QThread):
|
|||||||
self.finished_signal.emit(indexed, skipped, was_cancelled)
|
self.finished_signal.emit(indexed, skipped, was_cancelled)
|
||||||
|
|
||||||
def _save_to_db(self, cursor, filename, path, content):
|
def _save_to_db(self, cursor, filename, path, content):
|
||||||
# 1. Text speichern
|
cursor.execute("INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)", (filename, path, content))
|
||||||
cursor.execute(
|
|
||||||
"INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)",
|
|
||||||
(filename, path, content)
|
|
||||||
)
|
|
||||||
doc_id = cursor.lastrowid
|
doc_id = cursor.lastrowid
|
||||||
|
|
||||||
# 2. Embedding erstellen (Max 8000 chars)
|
|
||||||
embedding = self.model.encode(content[:8000], convert_to_tensor=False)
|
embedding = self.model.encode(content[:8000], convert_to_tensor=False)
|
||||||
embedding_blob = embedding.tobytes()
|
embedding_blob = embedding.tobytes()
|
||||||
|
|
||||||
# 3. Vektor speichern
|
|
||||||
cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (doc_id, embedding_blob))
|
cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (doc_id, embedding_blob))
|
||||||
|
|
||||||
# --- 3. UI (Unverändert) ---
|
|
||||||
|
# --- 4. UI ---
|
||||||
|
|
||||||
class UffWindow(QMainWindow):
|
class UffWindow(QMainWindow):
|
||||||
def __init__(self):
|
def __init__(self, splash=None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.splash = splash
|
||||||
self.db = DatabaseHandler()
|
self.db = DatabaseHandler()
|
||||||
self.indexer_thread = None
|
self.indexer_thread = None
|
||||||
self.initUI()
|
self.initUI()
|
||||||
self.load_saved_folders()
|
self.load_saved_folders()
|
||||||
|
|
||||||
def initUI(self):
|
def initUI(self):
|
||||||
self.setWindowTitle("UFF Text Search v5.0 (Hybrid Zip)")
|
self.setWindowTitle("UFF Text Search")
|
||||||
self.resize(1000, 700)
|
self.resize(1000, 700)
|
||||||
|
|
||||||
central = QWidget()
|
central = QWidget()
|
||||||
@@ -395,22 +385,18 @@ class UffWindow(QMainWindow):
|
|||||||
left_panel.setFixedWidth(250)
|
left_panel.setFixedWidth(250)
|
||||||
left_layout = QVBoxLayout(left_panel)
|
left_layout = QVBoxLayout(left_panel)
|
||||||
left_layout.setContentsMargins(0, 0, 0, 0)
|
left_layout.setContentsMargins(0, 0, 0, 0)
|
||||||
|
|
||||||
lbl_folders = QLabel("📂 Meine Ordner")
|
lbl_folders = QLabel("📂 Meine Ordner")
|
||||||
lbl_folders.setStyleSheet("font-weight: bold; font-size: 14px;")
|
lbl_folders.setStyleSheet("font-weight: bold; font-size: 14px;")
|
||||||
|
|
||||||
self.folder_list = QListWidget()
|
self.folder_list = QListWidget()
|
||||||
self.folder_list.setSelectionMode(QListWidget.SelectionMode.SingleSelection)
|
self.folder_list.setSelectionMode(QListWidget.SelectionMode.SingleSelection)
|
||||||
|
|
||||||
btn_add = QPushButton(" + Hinzufügen")
|
self.btn_add = QPushButton(" + Hinzufügen")
|
||||||
btn_add.clicked.connect(self.add_new_folder)
|
self.btn_add.clicked.connect(self.add_new_folder)
|
||||||
|
self.btn_remove = QPushButton(" - Entfernen")
|
||||||
btn_remove = QPushButton(" - Entfernen")
|
self.btn_remove.clicked.connect(self.delete_selected_folder)
|
||||||
btn_remove.clicked.connect(self.delete_selected_folder)
|
|
||||||
|
|
||||||
self.btn_rescan = QPushButton(" ↻ Neu scannen")
|
self.btn_rescan = QPushButton(" ↻ Neu scannen")
|
||||||
self.btn_rescan.clicked.connect(self.rescan_selected_folder)
|
self.btn_rescan.clicked.connect(self.rescan_selected_folder)
|
||||||
|
|
||||||
self.btn_cancel = QPushButton("🛑 Abbrechen")
|
self.btn_cancel = QPushButton("🛑 Abbrechen")
|
||||||
self.btn_cancel.setStyleSheet("background-color: #ffcccc; color: #cc0000; font-weight: bold;")
|
self.btn_cancel.setStyleSheet("background-color: #ffcccc; color: #cc0000; font-weight: bold;")
|
||||||
self.btn_cancel.clicked.connect(self.cancel_indexing)
|
self.btn_cancel.clicked.connect(self.cancel_indexing)
|
||||||
@@ -418,8 +404,8 @@ class UffWindow(QMainWindow):
|
|||||||
|
|
||||||
left_layout.addWidget(lbl_folders)
|
left_layout.addWidget(lbl_folders)
|
||||||
left_layout.addWidget(self.folder_list)
|
left_layout.addWidget(self.folder_list)
|
||||||
left_layout.addWidget(btn_add)
|
left_layout.addWidget(self.btn_add)
|
||||||
left_layout.addWidget(btn_remove)
|
left_layout.addWidget(self.btn_remove)
|
||||||
left_layout.addStretch()
|
left_layout.addStretch()
|
||||||
left_layout.addWidget(self.btn_rescan)
|
left_layout.addWidget(self.btn_rescan)
|
||||||
left_layout.addWidget(self.btn_cancel)
|
left_layout.addWidget(self.btn_cancel)
|
||||||
@@ -434,21 +420,25 @@ class UffWindow(QMainWindow):
|
|||||||
self.input_search.returnPressed.connect(self.perform_search)
|
self.input_search.returnPressed.connect(self.perform_search)
|
||||||
self.input_search.setStyleSheet("padding: 8px; font-size: 14px;")
|
self.input_search.setStyleSheet("padding: 8px; font-size: 14px;")
|
||||||
|
|
||||||
btn_go = QPushButton("Suchen")
|
self.btn_go = QPushButton("Suchen")
|
||||||
btn_go.setFixedWidth(100)
|
self.btn_go.setFixedWidth(100)
|
||||||
btn_go.clicked.connect(self.perform_search)
|
self.btn_go.clicked.connect(self.perform_search)
|
||||||
|
|
||||||
search_container.addWidget(self.input_search)
|
search_container.addWidget(self.input_search)
|
||||||
search_container.addWidget(btn_go)
|
search_container.addWidget(self.btn_go)
|
||||||
|
|
||||||
self.lbl_status = QLabel("Bereit. Hybrid-Modell geladen.")
|
self.lbl_status = QLabel("Initialisiere...")
|
||||||
self.lbl_status.setStyleSheet("color: #666;")
|
self.lbl_status.setStyleSheet("color: #666;")
|
||||||
self.progress_bar = QProgressBar()
|
self.progress_bar = QProgressBar()
|
||||||
self.progress_bar.hide()
|
self.progress_bar.hide()
|
||||||
|
|
||||||
|
# STANDARD BROWSER MIT RICHTIGEN EINSTELLUNGEN
|
||||||
self.result_browser = QTextBrowser()
|
self.result_browser = QTextBrowser()
|
||||||
self.result_browser.setOpenExternalLinks(False)
|
# WICHTIG: Interne Links deaktivieren, damit wir sie abfangen können
|
||||||
self.result_browser.anchorClicked.connect(self.link_clicked)
|
self.result_browser.setOpenExternalLinks(False)
|
||||||
|
# Wenn wir darauf klicken, wird unser Slot aufgerufen
|
||||||
|
self.result_browser.anchorClicked.connect(self.link_clicked)
|
||||||
|
|
||||||
self.result_browser.setStyleSheet("background-color: white; border: 1px solid #ccc;")
|
self.result_browser.setStyleSheet("background-color: white; border: 1px solid #ccc;")
|
||||||
|
|
||||||
right_layout.addLayout(search_container)
|
right_layout.addLayout(search_container)
|
||||||
@@ -462,8 +452,39 @@ class UffWindow(QMainWindow):
|
|||||||
splitter.setSizes([250, 750])
|
splitter.setSizes([250, 750])
|
||||||
|
|
||||||
main_layout.addWidget(splitter)
|
main_layout.addWidget(splitter)
|
||||||
|
self.set_main_ui_enabled(False)
|
||||||
|
|
||||||
|
def set_main_ui_enabled(self, enabled):
|
||||||
|
self.input_search.setEnabled(enabled)
|
||||||
|
self.btn_go.setEnabled(enabled)
|
||||||
|
self.folder_list.setEnabled(enabled)
|
||||||
|
self.btn_add.setEnabled(enabled)
|
||||||
|
self.btn_remove.setEnabled(enabled)
|
||||||
|
self.btn_rescan.setEnabled(enabled)
|
||||||
|
|
||||||
|
def start_model_loading(self):
|
||||||
|
if self.splash:
|
||||||
|
self.splash.showMessage("Lade semantisches Modell...", Qt.AlignmentFlag.AlignBottom | Qt.AlignmentFlag.AlignHCenter, Qt.GlobalColor.white)
|
||||||
|
self.model_loader = ModelLoaderThread()
|
||||||
|
self.model_loader.model_loaded.connect(self.on_model_loaded)
|
||||||
|
self.model_loader.start()
|
||||||
|
|
||||||
|
def on_model_loaded(self, model):
|
||||||
|
if self.splash:
|
||||||
|
self.splash.showMessage("Modell geladen. Starte Benutzeroberfläche...", Qt.AlignmentFlag.AlignBottom | Qt.AlignmentFlag.AlignHCenter, Qt.GlobalColor.white)
|
||||||
|
|
||||||
|
if model is None:
|
||||||
|
self.lbl_status.setText("Fehler: Modell konnte nicht geladen werden.")
|
||||||
|
QMessageBox.critical(self, "Kritischer Fehler", "Das semantische Modell konnte nicht geladen werden.")
|
||||||
|
self.close()
|
||||||
|
else:
|
||||||
|
self.db.model = model
|
||||||
|
self.lbl_status.setText("Bereit. Hybrid-Modell geladen.")
|
||||||
|
self.set_main_ui_enabled(True)
|
||||||
|
|
||||||
|
if self.splash:
|
||||||
|
self.splash.finish(self)
|
||||||
|
|
||||||
# LOGIK
|
|
||||||
def load_saved_folders(self):
|
def load_saved_folders(self):
|
||||||
self.folder_list.clear()
|
self.folder_list.clear()
|
||||||
folders = self.db.get_folders()
|
folders = self.db.get_folders()
|
||||||
@@ -500,11 +521,14 @@ class UffWindow(QMainWindow):
|
|||||||
self.start_indexing(item.text())
|
self.start_indexing(item.text())
|
||||||
|
|
||||||
def start_indexing(self, folder):
|
def start_indexing(self, folder):
|
||||||
|
if not self.db.model:
|
||||||
|
QMessageBox.warning(self, "Bitte warten", "Das Suchmodell wird noch geladen.")
|
||||||
|
return
|
||||||
|
|
||||||
self.set_ui_busy(True)
|
self.set_ui_busy(True)
|
||||||
self.lbl_status.setText(f"Starte... {os.path.basename(folder)}")
|
self.lbl_status.setText(f"Starte... {os.path.basename(folder)}")
|
||||||
|
|
||||||
self.indexer_thread = IndexerThread(folder, db_name=self.db.db_name, model=self.db.model)
|
self.indexer_thread = IndexerThread(folder, db_name=self.db.db_name, model=self.db.model)
|
||||||
|
|
||||||
self.indexer_thread.progress_signal.connect(lambda msg: self.lbl_status.setText(msg))
|
self.indexer_thread.progress_signal.connect(lambda msg: self.lbl_status.setText(msg))
|
||||||
self.indexer_thread.finished_signal.connect(self.indexing_finished)
|
self.indexer_thread.finished_signal.connect(self.indexing_finished)
|
||||||
self.indexer_thread.start()
|
self.indexer_thread.start()
|
||||||
@@ -526,6 +550,9 @@ class UffWindow(QMainWindow):
|
|||||||
def set_ui_busy(self, busy):
|
def set_ui_busy(self, busy):
|
||||||
self.input_search.setEnabled(not busy)
|
self.input_search.setEnabled(not busy)
|
||||||
self.folder_list.setEnabled(not busy)
|
self.folder_list.setEnabled(not busy)
|
||||||
|
self.btn_add.setEnabled(not busy)
|
||||||
|
self.btn_remove.setEnabled(not busy)
|
||||||
|
self.btn_go.setEnabled(not busy)
|
||||||
self.btn_rescan.setVisible(not busy)
|
self.btn_rescan.setVisible(not busy)
|
||||||
self.btn_cancel.setVisible(busy)
|
self.btn_cancel.setVisible(busy)
|
||||||
if busy:
|
if busy:
|
||||||
@@ -549,15 +576,14 @@ class UffWindow(QMainWindow):
|
|||||||
html = "<h3 style='color: gray; text-align: center; margin-top: 20px;'>Nichts gefunden.</h3>"
|
html = "<h3 style='color: gray; text-align: center; margin-top: 20px;'>Nichts gefunden.</h3>"
|
||||||
|
|
||||||
for filename, filepath, snippet in results:
|
for filename, filepath, snippet in results:
|
||||||
# Falls es eine Datei im Zip ist, müssen wir den Link anpassen,
|
|
||||||
# damit er zumindest das Zip öffnet.
|
|
||||||
if " :: " in filepath:
|
if " :: " in filepath:
|
||||||
real_path = filepath.split(" :: ")[0]
|
real_path = filepath.split(" :: ")[0]
|
||||||
display_path = filepath # Zeige den virtuellen Pfad
|
display_path = filepath
|
||||||
else:
|
else:
|
||||||
real_path = filepath
|
real_path = filepath
|
||||||
display_path = filepath
|
display_path = filepath
|
||||||
|
|
||||||
|
# Link für QTextBrowser
|
||||||
file_url = QUrl.fromLocalFile(real_path).toString()
|
file_url = QUrl.fromLocalFile(real_path).toString()
|
||||||
|
|
||||||
html += f"""
|
html += f"""
|
||||||
@@ -571,11 +597,27 @@ class UffWindow(QMainWindow):
|
|||||||
"""
|
"""
|
||||||
self.result_browser.setHtml(html)
|
self.result_browser.setHtml(html)
|
||||||
|
|
||||||
|
# --- DIE FUNKTION ZUM ÖFFNEN DER LINKS ---
|
||||||
def link_clicked(self, url):
|
def link_clicked(self, url):
|
||||||
|
print(f"Versuche zu öffnen: {url.toString()}")
|
||||||
QDesktopServices.openUrl(url)
|
QDesktopServices.openUrl(url)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app = QApplication(sys.argv)
|
app = QApplication(sys.argv)
|
||||||
window = UffWindow()
|
|
||||||
|
splash = None
|
||||||
|
try:
|
||||||
|
pixmap = QPixmap("assets/uff_banner.jpeg")
|
||||||
|
splash = QSplashScreen(pixmap)
|
||||||
|
splash.show()
|
||||||
|
splash.showMessage("Initialisiere Anwendung...", Qt.AlignmentFlag.AlignBottom | Qt.AlignmentFlag.AlignHCenter, Qt.GlobalColor.white)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
app.processEvents()
|
||||||
|
|
||||||
|
window = UffWindow(splash)
|
||||||
window.show()
|
window.show()
|
||||||
|
window.start_model_loading()
|
||||||
|
|
||||||
sys.exit(app.exec())
|
sys.exit(app.exec())
|
||||||
Reference in New Issue
Block a user