add some docstring
This commit is contained in:
36
README.md
36
README.md
@@ -1,7 +1,11 @@
|
||||

|
||||
[UFF Search on GitHub](https://github.com/BildoBeucklin/unsorted-folder-full-text-search)
|
||||
|
||||
# UFF Search - Unsorted Folder Full-Text Search
|
||||
|
||||

|
||||

|
||||

|
||||
|
||||
UFF Search is a powerful desktop application for Windows that allows you to perform fast, intelligent, and fuzzy full-text searches on your local files, including searching inside ZIP archives.
|
||||
|
||||
It builds a local search index for the folders you specify, allowing you to quickly find documents based on their meaning (semantic search) and specific keywords, even with typos in your search query.
|
||||
@@ -13,6 +17,7 @@ It builds a local search index for the folders you specify, allowing you to quic
|
||||
* **Fuzzy Search:** Finds relevant files even if your search term has typos, powered by `rapidfuzz`.
|
||||
* **Wide File Type Support:** Extracts text from:
|
||||
* PDFs (`.pdf`)
|
||||
* Microsoft Office (`.docx`, `.xlsx`, `.pptx`)
|
||||
* Plain text formats (`.txt`, `.md`, `.py`, `.json`, `.csv`, `.html`, `.log`, `.ini`, `.xml`)
|
||||
* **Simple UI:** An easy-to-use interface to manage your indexed folders and view search results.
|
||||
* **Click to Open:** Search results can be clicked to open the file directly (or the containing ZIP archive).
|
||||
@@ -33,14 +38,13 @@ A hybrid scoring system ranks the results, giving you the best of both worlds.
|
||||
A pre-built installer (`UFF_Search_Installer_v3.exe`) is available for easy installation. This is the recommended method for most users.
|
||||
|
||||
### From Source
|
||||
To run the application from the source code, you'll need Python 3 and the following dependencies:
|
||||
To run the application from the source code, you'll need Python 3.
|
||||
|
||||
1. **Clone the repository:**
|
||||
```bash
|
||||
git clone https://github.com/BildoBeucklin/unsorted-folder-full-text-search.git
|
||||
cd unsorted-folder-full-text-search
|
||||
```
|
||||
*(Note: You might need to update the repository URL)*
|
||||
|
||||
2. **Install dependencies:**
|
||||
It is highly recommended to use a virtual environment.
|
||||
@@ -50,9 +54,24 @@ To run the application from the source code, you'll need Python 3 and the follow
|
||||
|
||||
3. **Run the application:**
|
||||
```bash
|
||||
python uff_app.py
|
||||
python main.py
|
||||
```
|
||||
|
||||
## Building from Source
|
||||
|
||||
To create a standalone executable from the source code, you can use `pyinstaller`:
|
||||
|
||||
1. **Install PyInstaller:**
|
||||
```bash
|
||||
pip install pyinstaller
|
||||
```
|
||||
|
||||
2. **Build the executable:**
|
||||
```bash
|
||||
pyinstaller --name "UFF_Search" --windowed --onefile --icon="favicon.ico" --add-data "assets;assets" main.py
|
||||
```
|
||||
This command will create a single executable file in the `dist` folder.
|
||||
|
||||
## Usage
|
||||
|
||||
1. Start the application.
|
||||
@@ -69,9 +88,14 @@ To run the application from the source code, you'll need Python 3 and the follow
|
||||
* **Search Technology:**
|
||||
* `sentence-transformers` (specifically `all-MiniLM-L6-v2`) for semantic search.
|
||||
* `rapidfuzz` for fuzzy string matching.
|
||||
* **File Processing:** `pdfplumber` for PDF text extraction.
|
||||
* **File Processing:**
|
||||
* `pdfplumber` for PDF text extraction.
|
||||
* `python-docx` for `.docx` files.
|
||||
* `openpyxl` for `.xlsx` files.
|
||||
* `python-pptx` for `.pptx` files.
|
||||
* **Index Location:** The search index database (`uff_index.db`) is stored in `%LOCALAPPDATA%\UFF_Search` on Windows.
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the GNU Affero General Public License v3.0. See the [LICENSE](LICENSE) file for details.
|
||||
This project is licensed under the GNU Affero General Public License v3.0. See the [LICENSE](LICENSE) file for details.
|
||||
This license requires that if you use this software in a product or service that is accessed over a network, you must also make the source code available to the users of that product or service.
|
||||
15
config.py
15
config.py
@@ -18,15 +18,30 @@ LOG_FILE = os.path.join(APP_DATA_DIR, "uff.log")
|
||||
# --- LOGGING CLASS ---
|
||||
class Logger(object):
    """Minimal file-like sink that records everything written to it in LOG_FILE.

    The class provides ``write`` and ``flush``, the two methods needed for an
    instance to be assigned to ``sys.stdout`` / ``sys.stderr``.
    """

    def __init__(self):
        """Remember the current stdout and open the log file for writing."""
        # Kept so output could additionally be mirrored to the terminal.
        self.terminal = sys.stdout
        # Mode "w" truncates the log on every start; use "a" to append instead.
        self.log = open(LOG_FILE, "w", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to the log file and flush it right away.

        Args:
            message (str): Text to record.

        Returns:
            int: Number of characters written, as reported by the underlying
                file object (text streams are expected to return this count).
        """
        # To mirror output to the terminal as well, also call
        # self.terminal.write(message) here.
        written = self.log.write(message)
        # Flush after every write so entries reach the file promptly.
        self.log.flush()
        return written

    def flush(self):
        """Flush buffered log data to the log file."""
        # A terminal mirror, if enabled, would need self.terminal.flush() too.
        self.log.flush()
|
||||
|
||||
# --- LOGGER ACTIVATION ---
# Runs as a side effect of importing this module: from this point on,
# everything printed (error output included) is written to the log file.
sys.stdout = sys.stderr = Logger()

print("--- LOGGER START ---")
print(f"Logfile: {LOG_FILE}")
|
||||
|
||||
|
||||
# --- QT MESSAGE HANDLER (Filter) ---
|
||||
def qt_message_handler(mode, context, message):
|
||||
msg_lower = message.lower()
|
||||
|
||||
89
database.py
89
database.py
@@ -2,19 +2,31 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import numpy as np
|
||||
import traceback # WICHTIG: Damit wir den vollen Fehler sehen
|
||||
import traceback
|
||||
from sentence_transformers import util
|
||||
from rapidfuzz import fuzz
|
||||
from config import DB_NAME, APP_DATA_DIR
|
||||
|
||||
class DatabaseHandler:
|
||||
"""
|
||||
Handles all database operations, including initialization,
|
||||
folder management, and searching.
|
||||
"""
|
||||
def __init__(self):
    """
    Create the handler and make sure the database schema exists.

    Copies the configured data directory and database name onto the
    instance, starts without an embedding model (``search`` returns no
    results while ``self.model`` is None), and then runs ``init_db``.
    """
    self.model = None
    self.db_name = DB_NAME
    self.app_data_dir = APP_DATA_DIR
    self.init_db()
|
||||
|
||||
def init_db(self):
|
||||
"""
|
||||
Initializes the database schema by creating the necessary tables
|
||||
(documents, folders, embeddings) if they don't already exist.
|
||||
"""
|
||||
conn = sqlite3.connect(self.db_name)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("CREATE VIRTUAL TABLE IF NOT EXISTS documents USING fts5(filename, path, content);")
|
||||
@@ -24,45 +36,82 @@ class DatabaseHandler:
|
||||
conn.close()
|
||||
|
||||
def add_folder(self, path):
    """
    Register a folder path in the ``folders`` table so it can be indexed.

    The alias stored next to the path is the folder's last path component.
    Because the statement is ``INSERT OR IGNORE``, a row that conflicts with
    an existing one is silently skipped and still counts as success.

    Args:
        path (str): The path of the folder to add.

    Returns:
        bool: True if the statement ran and was committed, False if an
            error occurred while inserting.
    """
    conn = sqlite3.connect(self.db_name)
    try:
        # Strip trailing separators first: basename("dir/") is "" and would
        # store the folder with an empty alias. Fall back to the full path
        # when nothing is left (e.g. a bare drive or filesystem root).
        alias = os.path.basename(path.rstrip("/\\")) or path
        conn.execute(
            "INSERT OR IGNORE INTO folders (path, alias) VALUES (?, ?)",
            (path, alias),
        )
        conn.commit()
        return True
    except Exception:
        return False
    finally:
        conn.close()
|
||||
|
||||
def remove_folder(self, path):
    """
    Remove a folder and everything indexed beneath it from the database.

    Deletes the matching rows from ``documents``, the ``embeddings`` rows
    that reference them, and the folder's own row in ``folders``.

    Args:
        path (str): The folder path as it was registered.
    """
    # A document belongs to the folder when its path is the folder path
    # followed by a path separator. A bare prefix match would also hit
    # siblings such as "docs2" when removing "docs", and LIKE would treat
    # "%" and "_" inside the folder name as wildcards.
    base = path.rstrip("/\\")
    # Constant SQL fragment; all values are still passed as bound parameters.
    belongs = "path = ? OR substr(path, 1, ?) IN (?, ?)"
    args = (path, len(base) + 1, base + "/", base + "\\")

    conn = sqlite3.connect(self.db_name)
    try:
        cursor = conn.cursor()
        # Embeddings go first, selected through a subquery while the document
        # rows still exist. This avoids binding one SQL variable per document,
        # which fails once a folder exceeds SQLite's variable limit.
        cursor.execute(
            "DELETE FROM embeddings WHERE doc_id IN "
            f"(SELECT rowid FROM documents WHERE {belongs})",
            args,
        )
        cursor.execute(f"DELETE FROM documents WHERE {belongs}", args)
        # Finally drop the folder entry itself.
        cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
        conn.commit()
    finally:
        # Release the connection even if one of the statements raised.
        conn.close()
|
||||
|
||||
def get_folders(self):
    """
    Return the paths of all registered folders.

    Returns:
        list: One path per row of the ``folders`` table, in the order the
            database returns them.
    """
    conn = sqlite3.connect(self.db_name)
    try:
        rows = conn.execute("SELECT path FROM folders").fetchall()
        return [row[0] for row in rows]
    finally:
        # Close the connection even when the query raises.
        conn.close()
|
||||
|
||||
def search(self, query):
|
||||
# Sicherheitscheck
|
||||
"""
|
||||
Performs a hybrid search combining semantic and lexical (keyword) search.
|
||||
|
||||
Args:
|
||||
query (str): The search query.
|
||||
|
||||
Returns:
|
||||
list: A list of search results, each containing
|
||||
(filename, path, snippet).
|
||||
"""
|
||||
# Safety check
|
||||
if not query.strip() or not self.model:
|
||||
return []
|
||||
|
||||
try:
|
||||
# 1. Semantische Vorbereitung
|
||||
# 1. Semantic Preparation
|
||||
q_vec = self.model.encode(query, convert_to_tensor=False)
|
||||
|
||||
conn = sqlite3.connect(self.db_name)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Embeddings laden
|
||||
# Load embeddings
|
||||
cursor.execute("SELECT doc_id, vec FROM embeddings")
|
||||
data = cursor.fetchall()
|
||||
doc_ids = [d[0] for d in data]
|
||||
@@ -71,16 +120,16 @@ class DatabaseHandler:
|
||||
conn.close()
|
||||
return []
|
||||
|
||||
# Umwandlung BLOB -> Numpy Array
|
||||
# Hier knallt es oft, wenn die DB korrupt ist oder Dimensionen nicht passen
|
||||
# Convert BLOB -> Numpy Array
|
||||
# This can fail if the DB is corrupt or dimensions mismatch
|
||||
vecs = np.array([np.frombuffer(d[1], dtype=np.float32) for d in data])
|
||||
|
||||
# Cosine Similarity berechnen
|
||||
# Calculate Cosine Similarity
|
||||
scores = util.cos_sim(q_vec, vecs)[0].numpy()
|
||||
scores = np.clip(scores, 0, 1)
|
||||
sem_map = {did: float(s) for did, s in zip(doc_ids, scores)}
|
||||
|
||||
# 2. Lexikalische Suche (FTS)
|
||||
# 2. Lexical Search (FTS)
|
||||
words = query.replace('"', '').split()
|
||||
if not words: words = [query]
|
||||
fts_query = " OR ".join([f'"{w}"*' for w in words])
|
||||
@@ -88,29 +137,29 @@ class DatabaseHandler:
|
||||
try:
|
||||
fts_rows = cursor.execute("SELECT rowid, filename, content FROM documents WHERE documents MATCH ? LIMIT 100", (fts_query,)).fetchall()
|
||||
except Exception as e:
|
||||
print(f"FTS Fehler (ignoriert): {e}")
|
||||
print(f"FTS Error (ignored): {e}")
|
||||
fts_rows = []
|
||||
|
||||
lex_map = {}
|
||||
for did, fname, content in fts_rows:
|
||||
r1 = fuzz.partial_ratio(query.lower(), fname.lower())
|
||||
# Content kürzen für Performance
|
||||
# Truncate content for performance
|
||||
r2 = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower())
|
||||
lex_map[did] = max(r1, r2) / 100.0
|
||||
|
||||
# 3. Hybrid Fusion
|
||||
final = {}
|
||||
ALPHA = 0.65
|
||||
BETA = 0.35
|
||||
ALPHA = 0.65 # Weight for semantic score
|
||||
BETA = 0.35 # Weight for lexical score
|
||||
for did, s_score in sem_map.items():
|
||||
if s_score < 0.15 and did not in lex_map: continue
|
||||
l_score = lex_map.get(did, 0.0)
|
||||
h_score = (s_score * ALPHA) + (l_score * BETA)
|
||||
# Kleiner Boost wenn beides passt
|
||||
# Small boost if both scores are good
|
||||
if s_score > 0.4 and l_score > 0.6: h_score += 0.1
|
||||
final[did] = h_score
|
||||
|
||||
# 4. Ergebnisse holen
|
||||
# 4. Fetch Results
|
||||
sorted_ids = sorted(final.keys(), key=lambda x: final[x], reverse=True)[:50]
|
||||
results = []
|
||||
for did in sorted_ids:
|
||||
@@ -121,8 +170,8 @@ class DatabaseHandler:
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
# DIESER TEIL IST NEU: Er schreibt den Fehler ins Logfile
|
||||
print(f"!!! KRITISCHER FEHLER IN DER SUCHE !!!")
|
||||
print(f"Fehler: {e}")
|
||||
# NEW: This part writes the error to the log file
|
||||
print(f"!!! CRITICAL ERROR IN SEARCH !!!")
|
||||
print(f"Error: {e}")
|
||||
print(traceback.format_exc())
|
||||
return []
|
||||
91
indexer.py
91
indexer.py
@@ -6,7 +6,7 @@ import zipfile
|
||||
import io
|
||||
from PyQt6.QtCore import QThread, pyqtSignal
|
||||
|
||||
# Importe optionaler Libraries
|
||||
# Optional library imports
|
||||
try: import docx
|
||||
except ImportError: docx = None
|
||||
try: import openpyxl
|
||||
@@ -15,19 +15,43 @@ try: from pptx import Presentation
|
||||
except ImportError: Presentation = None
|
||||
|
||||
class IndexerThread(QThread):
|
||||
"""
|
||||
A QThread that indexes files in a given folder, extracts their text content,
|
||||
and stores it in a database along with semantic embeddings.
|
||||
"""
|
||||
progress_signal = pyqtSignal(str)
|
||||
finished_signal = pyqtSignal(int, int, bool)
|
||||
|
||||
def __init__(self, folder, db_name, model):
    """
    Set up a worker for indexing one folder.

    Args:
        folder (str): Path of the folder to index.
        db_name (str): SQLite database file the results are written to.
        model: Sentence-transformer model used to create embeddings.
    """
    super().__init__()
    self.folder_path, self.db_name, self.model = folder, db_name, model
    # Cleared by stop(); run() checks it to cancel early.
    self.is_running = True
|
||||
|
||||
def stop(self):
    """Request cancellation by clearing the ``is_running`` flag."""
    self.is_running = False
|
||||
|
||||
def _extract_text(self, stream, filename):
|
||||
"""
|
||||
Extracts text from a file stream based on its extension.
|
||||
|
||||
Args:
|
||||
stream (io.BytesIO): The file stream to read from.
|
||||
filename (str): The name of the file.
|
||||
|
||||
Returns:
|
||||
str: The extracted text content.
|
||||
"""
|
||||
ext = os.path.splitext(filename)[1].lower()
|
||||
text = ""
|
||||
try:
|
||||
@@ -36,13 +60,15 @@ class IndexerThread(QThread):
|
||||
with pdfplumber.open(stream) as pdf:
|
||||
for p in pdf.pages:
|
||||
if t := p.extract_text(): text += t + "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext == ".docx" and docx:
|
||||
try:
|
||||
doc = docx.Document(stream)
|
||||
for para in doc.paragraphs: text += para.text + "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext == ".xlsx" and openpyxl:
|
||||
try:
|
||||
@@ -52,39 +78,50 @@ class IndexerThread(QThread):
|
||||
for row in sheet.iter_rows(values_only=True):
|
||||
row_text = " ".join([str(c) for c in row if c is not None])
|
||||
if row_text.strip(): text += row_text + "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext == ".pptx" and Presentation:
|
||||
try:
|
||||
prs = Presentation(stream)
|
||||
for i, slide in enumerate(prs.slides):
|
||||
text += f"\n--- Folie {i+1} ---\n"
|
||||
text += f"\n--- Slide {i+1} ---\n"
|
||||
for shape in slide.shapes:
|
||||
if shape.has_text_frame:
|
||||
for p in shape.text_frame.paragraphs:
|
||||
for r in p.runs: text += r.text + " "
|
||||
text += "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
|
||||
try:
|
||||
content = stream.read()
|
||||
if isinstance(content, str): text = content
|
||||
else: text = content.decode('utf-8', errors='ignore')
|
||||
except: pass
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
return text
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Starts the indexing process.
|
||||
|
||||
Iterates through files in the specified folder, extracts text,
|
||||
and saves it to the database. Emits progress and finished signals.
|
||||
"""
|
||||
conn = sqlite3.connect(self.db_name)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Cleanup old entries
|
||||
# Cleanup old entries for the folder
|
||||
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||
ids = [r[0] for r in cursor.fetchall()]
|
||||
if ids:
|
||||
cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids)
|
||||
placeholders = ','.join('?' * len(ids))
|
||||
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids)
|
||||
conn.commit()
|
||||
|
||||
indexed = 0
|
||||
@@ -92,11 +129,15 @@ class IndexerThread(QThread):
|
||||
cancelled = False
|
||||
|
||||
for root, dirs, files in os.walk(self.folder_path):
|
||||
if not self.is_running: cancelled = True; break
|
||||
if not self.is_running:
|
||||
cancelled = True
|
||||
break
|
||||
for file in files:
|
||||
if not self.is_running: cancelled = True; break
|
||||
if not self.is_running:
|
||||
cancelled = True
|
||||
break
|
||||
path = os.path.join(root, file)
|
||||
self.progress_signal.emit(f"Prüfe: {file}...")
|
||||
self.progress_signal.emit(f"Checking: {file}...")
|
||||
|
||||
if file.lower().endswith('.zip'):
|
||||
try:
|
||||
@@ -109,7 +150,8 @@ class IndexerThread(QThread):
|
||||
if content and len(content.strip()) > 20:
|
||||
self._save(cursor, zi.filename, vpath, content)
|
||||
indexed += 1
|
||||
except: skipped += 1
|
||||
except Exception:
|
||||
skipped += 1
|
||||
else:
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
@@ -118,17 +160,30 @@ class IndexerThread(QThread):
|
||||
if content and len(content.strip()) > 20:
|
||||
self._save(cursor, file, path, content)
|
||||
indexed += 1
|
||||
else: skipped += 1
|
||||
except: skipped += 1
|
||||
else:
|
||||
skipped += 1
|
||||
except Exception:
|
||||
skipped += 1
|
||||
|
||||
if cancelled: break
|
||||
if cancelled:
|
||||
break
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
self.finished_signal.emit(indexed, skipped, cancelled)
|
||||
|
||||
def _save(self, cursor, fname, path, content):
    """
    Store one document together with its embedding.

    Inserts the document row first, then uses the row id reported by the
    cursor as the ``doc_id`` of the embedding row.

    Args:
        cursor: The database cursor to execute the inserts on.
        fname (str): The name of the file.
        path (str): The full path to the file.
        content (str): The extracted text content.
    """
    insert_doc = "INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)"
    insert_vec = "INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)"

    cursor.execute(insert_doc, (fname, path, content))
    doc_id = cursor.lastrowid
    # Only the first 8000 characters are embedded, to avoid excessive memory use.
    embedding = self.model.encode(content[:8000], convert_to_tensor=False)
    cursor.execute(insert_vec, (doc_id, embedding.tobytes()))
|
||||
44
main.py
44
main.py
@@ -1,38 +1,36 @@
|
||||
# main.py
|
||||
import sys
|
||||
import os
|
||||
|
||||
from config import qt_message_handler, LOG_FILE
|
||||
|
||||
from PyQt6.QtWidgets import QApplication, QSplashScreen
|
||||
from PyQt6.QtGui import QPixmap, QFont
|
||||
from PyQt6.QtCore import qInstallMessageHandler
|
||||
|
||||
from config import Logger, qt_message_handler, LOG_FILE
|
||||
from ui import UffWindow
|
||||
|
||||
# 1. Logging Setup
|
||||
sys.stdout = Logger()
|
||||
sys.stderr = sys.stdout
|
||||
print(f"--- APP START ---")
|
||||
print(f"Logfile: {LOG_FILE}")
|
||||
|
||||
# 2. Filter für Qt Meldungen installieren
|
||||
qInstallMessageHandler(qt_message_handler)
|
||||
os.environ["QT_LOGGING_RULES"] = "qt.text.font.db=false;qt.qpa.fonts=false"
|
||||
|
||||
if __name__ == "__main__":
|
||||
app = QApplication(sys.argv)
|
||||
|
||||
# Globale Schriftart
|
||||
app.setFont(QFont("Segoe UI", 10))
|
||||
try:
|
||||
app = QApplication(sys.argv)
|
||||
app.setFont(QFont("Segoe UI", 10))
|
||||
|
||||
splash = None
|
||||
if os.path.exists("assets/uff_banner.jpeg"):
|
||||
try:
|
||||
splash = QSplashScreen(QPixmap("assets/uff_banner.jpeg"))
|
||||
splash.show()
|
||||
except: pass
|
||||
splash = None
|
||||
if os.path.exists("assets/uff_banner.jpeg"):
|
||||
try:
|
||||
splash = QSplashScreen(QPixmap("assets/uff_banner.jpeg"))
|
||||
splash.show()
|
||||
except: pass
|
||||
|
||||
window = UffWindow(splash)
|
||||
window.show()
|
||||
window.start_model_loading()
|
||||
|
||||
sys.exit(app.exec())
|
||||
window = UffWindow(splash)
|
||||
window.show()
|
||||
window.start_model_loading()
|
||||
|
||||
sys.exit(app.exec())
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print("CRITICAL MAIN CRASH:")
|
||||
print(traceback.format_exc())
|
||||
Reference in New Issue
Block a user