From 1855810c140876c661dc2bbce5d63058bafb0722 Mon Sep 17 00:00:00 2001 From: Konstantin Date: Sat, 10 Jan 2026 13:23:01 +0100 Subject: [PATCH] add some docstring --- README.md | 36 +++++++++++++++++---- config.py | 15 +++++++++ database.py | 89 +++++++++++++++++++++++++++++++++++++++------------ indexer.py | 91 ++++++++++++++++++++++++++++++++++++++++++----------- main.py | 44 +++++++++++++------------- 5 files changed, 208 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index a5507f6..f415993 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ -![UFF Banner](assets/uff_banner.jpeg) +[![UFF Banner](assets/uff_banner.jpeg)](https://github.com/BildoBeucklin/unsorted-folder-full-text-search) # UFF Search - Unsorted Folder Full-Text Search +![GitHub stars](https://img.shields.io/github/stars/BildoBeucklin/unsorted-folder-full-text-search?style=social) +![GitHub forks](https://img.shields.io/github/forks/BildoBeucklin/unsorted-folder-full-text-search?style=social) +![GitHub license](https://img.shields.io/github/license/BildoBeucklin/unsorted-folder-full-text-search) + UFF Search is a powerful desktop application for Windows that allows you to perform fast, intelligent, and fuzzy full-text searches on your local files, including searching inside ZIP archives. It builds a local search index for the folders you specify, allowing you to quickly find documents based on their meaning (semantic search) and specific keywords, even with typos in your search query. @@ -13,6 +17,7 @@ It builds a local search index for the folders you specify, allowing you to quic * **Fuzzy Search:** Finds relevant files even if your search term has typos, powered by `rapidfuzz`. 
* **Wide File Type Support:** Extracts text from: * PDFs (`.pdf`) + * Microsoft Office (`.docx`, `.xlsx`, `.pptx`) * Plain text formats (`.txt`, `.md`, `.py`, `.json`, `.csv`, `.html`, `.log`, `.ini`, `.xml`) * **Simple UI:** An easy-to-use interface to manage your indexed folders and view search results. * **Click to Open:** Search results can be clicked to open the file directly (or the containing ZIP archive). @@ -33,14 +38,13 @@ A hybrid scoring system ranks the results, giving you the best of both worlds. A pre-built installer (`UFF_Search_Installer_v3.exe`) is available for easy installation. This is the recommended method for most users. ### From Source -To run the application from the source code, you'll need Python 3 and the following dependencies: +To run the application from the source code, you'll need Python 3. 1. **Clone the repository:** ```bash git clone https://github.com/BildoBeucklin/unsorted-folder-full-text-search.git cd unsorted-folder-full-text-search ``` - *(Note: You might need to update the repository URL)* 2. **Install dependencies:** It is highly recommended to use a virtual environment. @@ -50,9 +54,24 @@ To run the application from the source code, you'll need Python 3 and the follow 3. **Run the application:** ```bash - python uff_app.py + python main.py ``` +## Building from Source + +To create a standalone executable from the source code, you can use `pyinstaller`: + +1. **Install PyInstaller:** + ```bash + pip install pyinstaller + ``` + +2. **Build the executable:** + ```bash + pyinstaller --name "UFF_Search" --windowed --onefile --icon="favicon.ico" --add-data "assets;assets" main.py + ``` +This command will create a single executable file in the `dist` folder. + ## Usage 1. Start the application. @@ -69,9 +88,14 @@ To run the application from the source code, you'll need Python 3 and the follow * **Search Technology:** * `sentence-transformers` (specifically `all-MiniLM-L6-v2`) for semantic search. 
* `rapidfuzz` for fuzzy string matching. -* **File Processing:** `pdfplumber` for PDF text extraction. +* **File Processing:** + * `pdfplumber` for PDF text extraction. + * `python-docx` for `.docx` files. + * `openpyxl` for `.xlsx` files. + * `python-pptx` for `.pptx` files. * **Index Location:** The search index database (`uff_index.db`) is stored in `%LOCALAPPDATA%\UFF_Search` on Windows. ## License -This project is licensed under the GNU Affero General Public License v3.0. See the [LICENSE](LICENSE) file for details. \ No newline at end of file +This project is licensed under the GNU Affero General Public License v3.0. See the [LICENSE](LICENSE) file for details. +This license requires that if you use this software in a product or service that is accessed over a network, you must also make the source code available to the users of that product or service. \ No newline at end of file diff --git a/config.py b/config.py index f1fab20..46939b6 100644 --- a/config.py +++ b/config.py @@ -18,15 +18,30 @@ LOG_FILE = os.path.join(APP_DATA_DIR, "uff.log") # --- LOGGING KLASSE --- class Logger(object): def __init__(self): + # "w" overwrites the log file on every start. Use "a" to append instead. + self.terminal = sys.stdout # Optional: kept in case output should ALSO be shown in the terminal self.log = open(LOG_FILE, "w", encoding="utf-8") def write(self, message): + # Optional: also write to the terminal (leave commented out to write to the log file only) + # self.terminal.write(message) + self.log.write(message) self.log.flush() def flush(self): + # self.terminal.flush() self.log.flush() +# --- LOGGER ACTIVATION --- +# This now happens immediately when this module is imported!
+sys.stdout = Logger() +sys.stderr = sys.stdout # Redirect errors to the log as well + +print(f"--- LOGGER START ---") +print(f"Logfile: {LOG_FILE}") + + # --- QT MESSAGE HANDLER (Filter) --- def qt_message_handler(mode, context, message): msg_lower = message.lower() diff --git a/database.py b/database.py index 3c5256b..455e9ce 100644 --- a/database.py +++ b/database.py @@ -2,19 +2,31 @@ import sqlite3 import os import numpy as np -import traceback # WICHTIG: Damit wir den vollen Fehler sehen +import traceback from sentence_transformers import util from rapidfuzz import fuzz from config import DB_NAME, APP_DATA_DIR class DatabaseHandler: + """ + Handles all database operations, including initialization, + folder management, and searching. + """ def __init__(self): + """ + Initializes the DatabaseHandler, sets up the database path, + and initializes the database schema. + """ self.app_data_dir = APP_DATA_DIR self.db_name = DB_NAME self.model = None self.init_db() def init_db(self): + """ + Initializes the database schema by creating the necessary tables + (documents, folders, embeddings) if they don't already exist. + """ conn = sqlite3.connect(self.db_name) cursor = conn.cursor() cursor.execute("CREATE VIRTUAL TABLE IF NOT EXISTS documents USING fts5(filename, path, content);") @@ -24,45 +36,82 @@ class DatabaseHandler: conn.close() def add_folder(self, path): + """ + Adds a new folder path to the database to be indexed. + + Args: + path (str): The absolute path of the folder to add. + + Returns: + bool: True if the insert ran without error (an already-registered path is ignored by INSERT OR IGNORE and still returns True), False if an exception was raised. + """ conn = sqlite3.connect(self.db_name) try: conn.execute("INSERT OR IGNORE INTO folders (path, alias) VALUES (?, ?)", (path, os.path.basename(path))) conn.commit() return True - except: return False - finally: conn.close() + except Exception: + return False + finally: + conn.close() def remove_folder(self, path): + """ + Removes a folder and all its associated indexed files from the database.
+ + Args: + path (str): The absolute path of the folder to remove. + """ conn = sqlite3.connect(self.db_name) cursor = conn.cursor() + # Find all document IDs associated with the folder path cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{path}%",)) ids = [row[0] for row in cursor.fetchall()] if ids: + # Delete documents and their embeddings cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",)) - cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids) + placeholders = ','.join('?' * len(ids)) + cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids) + # Remove the folder entry cursor.execute("DELETE FROM folders WHERE path = ?", (path,)) conn.commit() conn.close() def get_folders(self): + """ + Retrieves a list of all indexed folder paths. + + Returns: + list: A list of folder paths. + """ conn = sqlite3.connect(self.db_name) rows = conn.execute("SELECT path FROM folders").fetchall() conn.close() return [r[0] for r in rows] def search(self, query): - # Sicherheitscheck + """ + Performs a hybrid search combining semantic and lexical (keyword) search. + + Args: + query (str): The search query. + + Returns: + list: A list of search results, each containing + (filename, path, snippet). + """ + # Safety check if not query.strip() or not self.model: return [] try: - # 1. Semantische Vorbereitung + # 1. 
Semantic Preparation q_vec = self.model.encode(query, convert_to_tensor=False) conn = sqlite3.connect(self.db_name) cursor = conn.cursor() - # Embeddings laden + # Load embeddings cursor.execute("SELECT doc_id, vec FROM embeddings") data = cursor.fetchall() doc_ids = [d[0] for d in data] @@ -71,16 +120,16 @@ class DatabaseHandler: conn.close() return [] - # Umwandlung BLOB -> Numpy Array - # Hier knallt es oft, wenn die DB korrupt ist oder Dimensionen nicht passen + # Convert BLOB -> Numpy Array + # This can fail if the DB is corrupt or dimensions mismatch vecs = np.array([np.frombuffer(d[1], dtype=np.float32) for d in data]) - # Cosine Similarity berechnen + # Calculate Cosine Similarity scores = util.cos_sim(q_vec, vecs)[0].numpy() scores = np.clip(scores, 0, 1) sem_map = {did: float(s) for did, s in zip(doc_ids, scores)} - # 2. Lexikalische Suche (FTS) + # 2. Lexical Search (FTS) words = query.replace('"', '').split() if not words: words = [query] fts_query = " OR ".join([f'"{w}"*' for w in words]) @@ -88,29 +137,29 @@ class DatabaseHandler: try: fts_rows = cursor.execute("SELECT rowid, filename, content FROM documents WHERE documents MATCH ? LIMIT 100", (fts_query,)).fetchall() except Exception as e: - print(f"FTS Fehler (ignoriert): {e}") + print(f"FTS Error (ignored): {e}") fts_rows = [] lex_map = {} for did, fname, content in fts_rows: r1 = fuzz.partial_ratio(query.lower(), fname.lower()) - # Content kürzen für Performance + # Truncate content for performance r2 = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower()) lex_map[did] = max(r1, r2) / 100.0 # 3. 
Hybrid Fusion final = {} - ALPHA = 0.65 - BETA = 0.35 + ALPHA = 0.65 # Weight for semantic score + BETA = 0.35 # Weight for lexical score for did, s_score in sem_map.items(): if s_score < 0.15 and did not in lex_map: continue l_score = lex_map.get(did, 0.0) h_score = (s_score * ALPHA) + (l_score * BETA) - # Kleiner Boost wenn beides passt + # Small boost if both scores are good if s_score > 0.4 and l_score > 0.6: h_score += 0.1 final[did] = h_score - # 4. Ergebnisse holen + # 4. Fetch Results sorted_ids = sorted(final.keys(), key=lambda x: final[x], reverse=True)[:50] results = [] for did in sorted_ids: @@ -121,8 +170,8 @@ class DatabaseHandler: return results except Exception as e: - # DIESER TEIL IST NEU: Er schreibt den Fehler ins Logfile - print(f"!!! KRITISCHER FEHLER IN DER SUCHE !!!") - print(f"Fehler: {e}") + # NEW: This part writes the error to the log file + print(f"!!! CRITICAL ERROR IN SEARCH !!!") + print(f"Error: {e}") print(traceback.format_exc()) return [] \ No newline at end of file diff --git a/indexer.py b/indexer.py index 0329648..ece4b80 100644 --- a/indexer.py +++ b/indexer.py @@ -6,7 +6,7 @@ import zipfile import io from PyQt6.QtCore import QThread, pyqtSignal -# Importe optionaler Libraries +# Optional library imports try: import docx except ImportError: docx = None try: import openpyxl @@ -15,19 +15,43 @@ try: from pptx import Presentation except ImportError: Presentation = None class IndexerThread(QThread): + """ + A QThread that indexes files in a given folder, extracts their text content, + and stores it in a database along with semantic embeddings. + """ progress_signal = pyqtSignal(str) finished_signal = pyqtSignal(int, int, bool) def __init__(self, folder, db_name, model): + """ + Initializes the IndexerThread. + + Args: + folder (str): The path to the folder to be indexed. + db_name (str): The name of the SQLite database file. + model: The sentence-transformer model for creating embeddings. 
+ """ super().__init__() self.folder_path = folder self.db_name = db_name self.model = model self.is_running = True - def stop(self): self.is_running = False + def stop(self): + """Stops the indexing process.""" + self.is_running = False def _extract_text(self, stream, filename): + """ + Extracts text from a file stream based on its extension. + + Args: + stream (io.BytesIO): The file stream to read from. + filename (str): The name of the file. + + Returns: + str: The extracted text content. + """ ext = os.path.splitext(filename)[1].lower() text = "" try: @@ -36,13 +60,15 @@ class IndexerThread(QThread): with pdfplumber.open(stream) as pdf: for p in pdf.pages: if t := p.extract_text(): text += t + "\n" - except: pass + except Exception: + pass elif ext == ".docx" and docx: try: doc = docx.Document(stream) for para in doc.paragraphs: text += para.text + "\n" - except: pass + except Exception: + pass elif ext == ".xlsx" and openpyxl: try: @@ -52,39 +78,50 @@ class IndexerThread(QThread): for row in sheet.iter_rows(values_only=True): row_text = " ".join([str(c) for c in row if c is not None]) if row_text.strip(): text += row_text + "\n" - except: pass + except Exception: + pass elif ext == ".pptx" and Presentation: try: prs = Presentation(stream) for i, slide in enumerate(prs.slides): - text += f"\n--- Folie {i+1} ---\n" + text += f"\n--- Slide {i+1} ---\n" for shape in slide.shapes: if shape.has_text_frame: for p in shape.text_frame.paragraphs: for r in p.runs: text += r.text + " " text += "\n" - except: pass + except Exception: + pass elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]: try: content = stream.read() if isinstance(content, str): text = content else: text = content.decode('utf-8', errors='ignore') - except: pass - except: pass + except Exception: + pass + except Exception: + pass return text def run(self): + """ + Starts the indexing process. 
+ + Iterates through files in the specified folder, extracts text, + and saves it to the database. Emits progress and finished signals. + """ conn = sqlite3.connect(self.db_name) cursor = conn.cursor() - # Cleanup old entries + # Cleanup old entries for the folder cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",)) ids = [r[0] for r in cursor.fetchall()] if ids: cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",)) - cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids) + placeholders = ','.join('?' * len(ids)) + cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids) conn.commit() indexed = 0 @@ -92,11 +129,15 @@ class IndexerThread(QThread): cancelled = False for root, dirs, files in os.walk(self.folder_path): - if not self.is_running: cancelled = True; break + if not self.is_running: + cancelled = True + break for file in files: - if not self.is_running: cancelled = True; break + if not self.is_running: + cancelled = True + break path = os.path.join(root, file) - self.progress_signal.emit(f"Prüfe: {file}...") + self.progress_signal.emit(f"Checking: {file}...") if file.lower().endswith('.zip'): try: @@ -109,7 +150,8 @@ class IndexerThread(QThread): if content and len(content.strip()) > 20: self._save(cursor, zi.filename, vpath, content) indexed += 1 - except: skipped += 1 + except Exception: + skipped += 1 else: try: with open(path, "rb") as f: @@ -118,17 +160,30 @@ class IndexerThread(QThread): if content and len(content.strip()) > 20: self._save(cursor, file, path, content) indexed += 1 - else: skipped += 1 - except: skipped += 1 + else: + skipped += 1 + except Exception: + skipped += 1 - if cancelled: break + if cancelled: + break conn.commit() conn.close() self.finished_signal.emit(indexed, skipped, cancelled) def _save(self, cursor, fname, path, content): + """ + Saves the extracted content and its embedding to the database. 
+ + Args: + cursor: The database cursor. + fname (str): The name of the file. + path (str): The full path to the file. + content (str): The extracted text content. + """ cursor.execute("INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)", (fname, path, content)) did = cursor.lastrowid + # Truncate content for embedding to avoid excessive memory usage vec = self.model.encode(content[:8000], convert_to_tensor=False).tobytes() cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (did, vec)) \ No newline at end of file diff --git a/main.py b/main.py index a2f2c3e..b3cf799 100644 --- a/main.py +++ b/main.py @@ -1,38 +1,36 @@ # main.py import sys import os + +from config import qt_message_handler, LOG_FILE + from PyQt6.QtWidgets import QApplication, QSplashScreen from PyQt6.QtGui import QPixmap, QFont from PyQt6.QtCore import qInstallMessageHandler -from config import Logger, qt_message_handler, LOG_FILE from ui import UffWindow -# 1. Logging Setup -sys.stdout = Logger() -sys.stderr = sys.stdout -print(f"--- APP START ---") -print(f"Logfile: {LOG_FILE}") - -# 2. 
Filter für Qt Meldungen installieren qInstallMessageHandler(qt_message_handler) os.environ["QT_LOGGING_RULES"] = "qt.text.font.db=false;qt.qpa.fonts=false" if __name__ == "__main__": - app = QApplication(sys.argv) - - # Globale Schriftart - app.setFont(QFont("Segoe UI", 10)) + try: + app = QApplication(sys.argv) + app.setFont(QFont("Segoe UI", 10)) - splash = None - if os.path.exists("assets/uff_banner.jpeg"): - try: - splash = QSplashScreen(QPixmap("assets/uff_banner.jpeg")) - splash.show() - except: pass + splash = None + if os.path.exists("assets/uff_banner.jpeg"): + try: + splash = QSplashScreen(QPixmap("assets/uff_banner.jpeg")) + splash.show() + except: pass - window = UffWindow(splash) - window.show() - window.start_model_loading() - - sys.exit(app.exec()) \ No newline at end of file + window = UffWindow(splash) + window.show() + window.start_model_loading() + + sys.exit(app.exec()) + except Exception as e: + import traceback + print("CRITICAL MAIN CRASH:") + print(traceback.format_exc()) \ No newline at end of file