Remove the outdated UFF-Search.spec file; update requirements.txt to replace pypdf with pdfplumber and pdfminer.six; and enhance the scoring logic in uff_app.py for improved search accuracy.
This commit is contained in:
@@ -1,38 +0,0 @@
|
||||
# -*- mode: python ; coding: utf-8 -*-
|
||||
|
||||
|
||||
a = Analysis(
|
||||
['uff_app.py'],
|
||||
pathex=[],
|
||||
binaries=[],
|
||||
datas=[],
|
||||
hiddenimports=['rapidfuzz', 'pypdf'],
|
||||
hookspath=[],
|
||||
hooksconfig={},
|
||||
runtime_hooks=[],
|
||||
excludes=[],
|
||||
noarchive=False,
|
||||
optimize=0,
|
||||
)
|
||||
pyz = PYZ(a.pure)
|
||||
|
||||
exe = EXE(
|
||||
pyz,
|
||||
a.scripts,
|
||||
a.binaries,
|
||||
a.datas,
|
||||
[],
|
||||
name='UFF-Search',
|
||||
debug=False,
|
||||
bootloader_ignore_signals=False,
|
||||
strip=False,
|
||||
upx=True,
|
||||
upx_exclude=[],
|
||||
runtime_tmpdir=None,
|
||||
console=False,
|
||||
disable_windowed_traceback=False,
|
||||
argv_emulation=False,
|
||||
target_arch=None,
|
||||
codesign_identity=None,
|
||||
entitlements_file=None,
|
||||
)
|
||||
@@ -1,3 +1,4 @@
|
||||
pypdf
|
||||
pdfplumber
|
||||
pdfminer.six
|
||||
rapidfuzz
|
||||
PyQt6
|
||||
20
uff_app.py
20
uff_app.py
@@ -1,7 +1,7 @@
|
||||
import sys
|
||||
import os
|
||||
import sqlite3
|
||||
from pypdf import PdfReader
|
||||
import pdfplumber
|
||||
|
||||
# NEU: Für die Fuzzy-Logik
|
||||
from rapidfuzz import process, fuzz
|
||||
@@ -120,19 +120,20 @@ class DatabaseHandler:
|
||||
scored_results = []
|
||||
|
||||
for filename, path, snippet, content in rows:
|
||||
# Wir berechnen Scores
|
||||
score_name = fuzz.partial_ratio(query.lower(), filename.lower())
|
||||
# Wir berechnen Scores mit besserer Gewichtung
|
||||
score_name = fuzz.WRatio(query.lower(), filename.lower())
|
||||
|
||||
# Content-Check: Wir nehmen Content (falls snippet zu kurz ist)
|
||||
# Begrenzung auf die ersten 5000 Zeichen für Performance
|
||||
check_content = content[:5000] if content else ""
|
||||
score_content = fuzz.partial_token_set_ratio(query.lower(), check_content.lower())
|
||||
|
||||
final_score = max(score_name, score_content)
|
||||
# Gewichteter Durchschnitt: Inhalt ist wichtiger als Dateiname
|
||||
final_score = (score_name * 0.2) + (score_content * 0.8)
|
||||
|
||||
# Bonus für exakte Wort-Treffer
|
||||
# Bonus für exakte Wort-Treffer (jetzt stärker)
|
||||
if all(w.lower() in (filename + check_content).lower() for w in words):
|
||||
final_score += 10
|
||||
final_score += 20
|
||||
|
||||
# Filter: Nur anzeigen, wenn Score halbwegs okay ist
|
||||
# Bei "vertraaag" vs "vertrag" ist der Score meist > 70
|
||||
@@ -166,10 +167,11 @@ class IndexerThread(QThread):
|
||||
ext = os.path.splitext(filepath)[1].lower()
|
||||
try:
|
||||
if ext == ".pdf":
|
||||
reader = PdfReader(filepath)
|
||||
with pdfplumber.open(filepath) as pdf:
|
||||
text = ""
|
||||
for page in reader.pages:
|
||||
if page_text := page.extract_text(): text += page_text + "\n"
|
||||
for page in pdf.pages:
|
||||
if page_text := page.extract_text():
|
||||
text += page_text + "\n"
|
||||
return text
|
||||
elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
|
||||
with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
|
||||
|
||||
Reference in New Issue
Block a user