From 81c7b0060f64cf5c3b656a25adfffaf4b3873caa Mon Sep 17 00:00:00 2001 From: Konstantin Date: Fri, 9 Jan 2026 16:09:33 +0100 Subject: [PATCH] Remove outdated UFF-Search.spec file and update requirements.txt to replace pypdf with pdfplumber and pdfminer.six; enhance scoring logic in uff_app.py for improved search accuracy. --- UFF-Search.spec | 38 -------------------------------------- requirements.txt | 3 ++- uff_app.py | 24 +++++++++++++----------- 3 files changed, 15 insertions(+), 50 deletions(-) delete mode 100644 UFF-Search.spec diff --git a/UFF-Search.spec b/UFF-Search.spec deleted file mode 100644 index 1563e28..0000000 --- a/UFF-Search.spec +++ /dev/null @@ -1,38 +0,0 @@ -# -*- mode: python ; coding: utf-8 -*- - - -a = Analysis( - ['uff_app.py'], - pathex=[], - binaries=[], - datas=[], - hiddenimports=['rapidfuzz', 'pypdf'], - hookspath=[], - hooksconfig={}, - runtime_hooks=[], - excludes=[], - noarchive=False, - optimize=0, -) -pyz = PYZ(a.pure) - -exe = EXE( - pyz, - a.scripts, - a.binaries, - a.datas, - [], - name='UFF-Search', - debug=False, - bootloader_ignore_signals=False, - strip=False, - upx=True, - upx_exclude=[], - runtime_tmpdir=None, - console=False, - disable_windowed_traceback=False, - argv_emulation=False, - target_arch=None, - codesign_identity=None, - entitlements_file=None, -) diff --git a/requirements.txt b/requirements.txt index 6bb09ce..392d738 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -pypdf +pdfplumber +pdfminer.six rapidfuzz PyQt6 \ No newline at end of file diff --git a/uff_app.py b/uff_app.py index b83d239..7b995ef 100644 --- a/uff_app.py +++ b/uff_app.py @@ -1,7 +1,7 @@ import sys import os import sqlite3 -from pypdf import PdfReader +import pdfplumber # NEU: Für die Fuzzy-Logik from rapidfuzz import process, fuzz @@ -120,19 +120,20 @@ class DatabaseHandler: scored_results = [] for filename, path, snippet, content in rows: - # Wir berechnen Scores - score_name = fuzz.partial_ratio(query.lower(), filename.lower()) + # Wir berechnen Scores mit besserer Gewichtung + score_name = fuzz.WRatio(query.lower(), filename.lower()) # Content-Check: Wir nehmen Content (falls snippet zu kurz ist) # Begrenzung auf die ersten 5000 Zeichen für Performance check_content = content[:5000] if content else "" score_content = fuzz.partial_token_set_ratio(query.lower(), check_content.lower()) - final_score = max(score_name, score_content) + # Gewichteter Durchschnitt: Inhalt ist wichtiger als Dateiname + final_score = (score_name * 0.2) + (score_content * 0.8) - # Bonus für exakte Wort-Treffer + # Bonus für exakte Wort-Treffer (jetzt stärker) if all(w.lower() in (filename + check_content).lower() for w in words): - final_score += 10 + final_score += 20 # Filter: Nur anzeigen, wenn Score halbwegs okay ist # Bei "vertraaag" vs "vertrag" ist der Score meist > 70 @@ -166,11 +167,12 @@ class IndexerThread(QThread): ext = os.path.splitext(filepath)[1].lower() try: if ext == ".pdf": - reader = PdfReader(filepath) - text = "" - for page in reader.pages: - if page_text := page.extract_text(): text += page_text + "\n" - return text + with pdfplumber.open(filepath) as pdf: + text = "" + for page in pdf.pages: + if page_text := page.extract_text(): + text += page_text + "\n" + return text elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]: with open(filepath, "r", encoding="utf-8", errors="ignore") as f: return f.read()