From 1855810c140876c661dc2bbce5d63058bafb0722 Mon Sep 17 00:00:00 2001
From: Konstantin <konstantin.rossmann@gmail.com>
Date: Sat, 10 Jan 2026 13:23:01 +0100
Subject: [PATCH] Add docstrings, update README, move logger setup into config

---
 README.md   | 36 +++++++++++++++++----
 config.py   | 15 +++++++++
 database.py | 89 +++++++++++++++++++++++++++++++++++++++------------
 indexer.py  | 91 ++++++++++++++++++++++++++++++++++++++++++-----------
 main.py     | 44 +++++++++++++-------------
 5 files changed, 208 insertions(+), 67 deletions(-)

diff --git a/README.md b/README.md
index a5507f6..f415993 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,11 @@
-![UFF Banner](assets/uff_banner.jpeg)
+[![UFF Banner](assets/uff_banner.jpeg)](https://github.com/BildoBeucklin/unsorted-folder-full-text-search)
 
 # UFF Search - Unsorted Folder Full-Text Search
 
+![GitHub stars](https://img.shields.io/github/stars/BildoBeucklin/unsorted-folder-full-text-search?style=social)
+![GitHub forks](https://img.shields.io/github/forks/BildoBeucklin/unsorted-folder-full-text-search?style=social)
+![GitHub license](https://img.shields.io/github/license/BildoBeucklin/unsorted-folder-full-text-search)
+
 UFF Search is a powerful desktop application for Windows that allows you to perform fast, intelligent, and fuzzy full-text searches on your local files, including searching inside ZIP archives.
 
 It builds a local search index for the folders you specify, allowing you to quickly find documents based on their meaning (semantic search) and specific keywords, even with typos in your search query.
@@ -13,6 +17,7 @@ It builds a local search index for the folders you specify, allowing you to quic
 *   **Fuzzy Search:** Finds relevant files even if your search term has typos, powered by `rapidfuzz`.
 *   **Wide File Type Support:** Extracts text from:
     *   PDFs (`.pdf`)
+    *   Microsoft Office (`.docx`, `.xlsx`, `.pptx`)
     *   Plain text formats (`.txt`, `.md`, `.py`, `.json`, `.csv`, `.html`, `.log`, `.ini`, `.xml`)
 *   **Simple UI:** An easy-to-use interface to manage your indexed folders and view search results.
 *   **Click to Open:** Search results can be clicked to open the file directly (or the containing ZIP archive).
@@ -33,14 +38,13 @@ A hybrid scoring system ranks the results, giving you the best of both worlds.
 A pre-built installer (`UFF_Search_Installer_v3.exe`) is available for easy installation. This is the recommended method for most users.
 
 ### From Source
-To run the application from the source code, you'll need Python 3 and the following dependencies:
+To run the application from the source code, you'll need Python 3.
 
 1.  **Clone the repository:**
     ```bash
     git clone https://github.com/BildoBeucklin/unsorted-folder-full-text-search.git
     cd unsorted-folder-full-text-search
     ```
-    *(Note: You might need to update the repository URL)*
 
 2.  **Install dependencies:**
     It is highly recommended to use a virtual environment.
@@ -50,9 +54,24 @@ To run the application from the source code, you'll need Python 3 and the follow
 
 3.  **Run the application:**
     ```bash
-    python uff_app.py
+    python main.py
     ```
 
+## Building from Source
+
+To create a standalone executable from the source code, you can use `pyinstaller`:
+
+1.  **Install PyInstaller:**
+    ```bash
+    pip install pyinstaller
+    ```
+
+2.  **Build the executable:**
+    ```bash
+    pyinstaller --name "UFF_Search" --windowed --onefile --icon="favicon.ico" --add-data "assets;assets" main.py
+    ```
+This command will create a single executable file in the `dist` folder.
+
 ## Usage
 
 1.  Start the application.
@@ -69,9 +88,14 @@ To run the application from the source code, you'll need Python 3 and the follow
 *   **Search Technology:**
     *   `sentence-transformers` (specifically `all-MiniLM-L6-v2`) for semantic search.
     *   `rapidfuzz` for fuzzy string matching.
-*   **File Processing:** `pdfplumber` for PDF text extraction.
+*   **File Processing:** 
+    *   `pdfplumber` for PDF text extraction.
+    *   `python-docx` for `.docx` files.
+    *   `openpyxl` for `.xlsx` files.
+    *   `python-pptx` for `.pptx` files.
 *   **Index Location:** The search index database (`uff_index.db`) is stored in `%LOCALAPPDATA%\UFF_Search` on Windows.
 
 ## License
 
-This project is licensed under the GNU Affero General Public License v3.0. See the [LICENSE](LICENSE) file for details.
\ No newline at end of file
+This project is licensed under the GNU Affero General Public License v3.0. See the [LICENSE](LICENSE) file for details.
+Under this license, if you modify this software and let users interact with it over a network, you must also offer those users the corresponding source code of your modified version.
\ No newline at end of file
diff --git a/config.py b/config.py
index f1fab20..46939b6 100644
--- a/config.py
+++ b/config.py
@@ -18,15 +18,30 @@ LOG_FILE = os.path.join(APP_DATA_DIR, "uff.log")
 # --- LOGGING KLASSE ---
 class Logger(object):
     def __init__(self):
+        # Mode "w" (used in open() below) overwrites the log on every start; use "a" to append instead.
+        self.terminal = sys.stdout # Optional: kept in case output should ALSO be shown in the terminal
         self.log = open(LOG_FILE, "w", encoding="utf-8")
 
     def write(self, message):
+        # Optional: also write to the terminal (uncomment the next line; leave it commented for log file only)
+        # self.terminal.write(message) 
+        
         self.log.write(message)
         self.log.flush()
 
     def flush(self):
+        # self.terminal.flush()
         self.log.flush()
 
+# --- LOGGER ACTIVATION ---
+# This now happens immediately when this module is imported!
+sys.stdout = Logger()
+sys.stderr = sys.stdout # Redirect errors into the log as well
+
+print(f"--- LOGGER START ---")
+print(f"Logfile: {LOG_FILE}")
+
+
 # --- QT MESSAGE HANDLER (Filter) ---
 def qt_message_handler(mode, context, message):
     msg_lower = message.lower()
diff --git a/database.py b/database.py
index 3c5256b..455e9ce 100644
--- a/database.py
+++ b/database.py
@@ -2,19 +2,31 @@
 import sqlite3
 import os
 import numpy as np
-import traceback  # WICHTIG: Damit wir den vollen Fehler sehen
+import traceback 
 from sentence_transformers import util
 from rapidfuzz import fuzz
 from config import DB_NAME, APP_DATA_DIR
 
 class DatabaseHandler:
+    """
+    Handles all database operations, including initialization,
+    folder management, and searching.
+    """
     def __init__(self):
+        """
+        Initializes the DatabaseHandler, sets up the database path,
+        and initializes the database schema.
+        """
         self.app_data_dir = APP_DATA_DIR
         self.db_name = DB_NAME
         self.model = None 
         self.init_db()
 
     def init_db(self):
+        """
+        Initializes the database schema by creating the necessary tables
+        (documents, folders, embeddings) if they don't already exist.
+        """
         conn = sqlite3.connect(self.db_name)
         cursor = conn.cursor()
         cursor.execute("CREATE VIRTUAL TABLE IF NOT EXISTS documents USING fts5(filename, path, content);")
@@ -24,45 +36,82 @@ class DatabaseHandler:
         conn.close()
 
     def add_folder(self, path):
+        """
+        Adds a new folder path to the database to be indexed.
+
+        Args:
+            path (str): The absolute path of the folder to add.
+
+        Returns:
+            bool: True if the folder was added successfully, False otherwise.
+        """
         conn = sqlite3.connect(self.db_name)
         try:
             conn.execute("INSERT OR IGNORE INTO folders (path, alias) VALUES (?, ?)", (path, os.path.basename(path)))
             conn.commit()
             return True
-        except: return False
-        finally: conn.close()
+        except Exception:
+            return False
+        finally:
+            conn.close()
 
     def remove_folder(self, path):
+        """
+        Removes a folder and all its associated indexed files from the database.
+
+        Args:
+            path (str): The absolute path of the folder to remove.
+        """
         conn = sqlite3.connect(self.db_name)
         cursor = conn.cursor()
+        # Find all document IDs associated with the folder path
         cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{path}%",))
         ids = [row[0] for row in cursor.fetchall()]
         if ids:
+            # Delete documents and their embeddings
             cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{path}%",))
-            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids)
+            placeholders = ','.join('?' * len(ids))
+            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids)
+        # Remove the folder entry
         cursor.execute("DELETE FROM folders WHERE path = ?", (path,))
         conn.commit()
         conn.close()
 
     def get_folders(self):
+        """
+        Retrieves a list of all indexed folder paths.
+
+        Returns:
+            list: A list of folder paths.
+        """
         conn = sqlite3.connect(self.db_name)
         rows = conn.execute("SELECT path FROM folders").fetchall()
         conn.close()
         return [r[0] for r in rows]
 
     def search(self, query):
-        # Sicherheitscheck
+        """
+        Performs a hybrid search combining semantic and lexical (keyword) search.
+
+        Args:
+            query (str): The search query.
+
+        Returns:
+            list: A list of search results, each containing
+                  (filename, path, snippet).
+        """
+        # Safety check
         if not query.strip() or not self.model: 
             return []
         
         try:
-            # 1. Semantische Vorbereitung
+            # 1. Semantic Preparation
             q_vec = self.model.encode(query, convert_to_tensor=False)
             
             conn = sqlite3.connect(self.db_name)
             cursor = conn.cursor()
             
-            # Embeddings laden
+            # Load embeddings
             cursor.execute("SELECT doc_id, vec FROM embeddings")
             data = cursor.fetchall()
             doc_ids = [d[0] for d in data]
@@ -71,16 +120,16 @@ class DatabaseHandler:
                 conn.close()
                 return []
 
-            # Umwandlung BLOB -> Numpy Array
-            # Hier knallt es oft, wenn die DB korrupt ist oder Dimensionen nicht passen
+            # Convert BLOB -> Numpy Array
+            # This can fail if the DB is corrupt or dimensions mismatch
             vecs = np.array([np.frombuffer(d[1], dtype=np.float32) for d in data])
             
-            # Cosine Similarity berechnen
+            # Calculate Cosine Similarity
             scores = util.cos_sim(q_vec, vecs)[0].numpy()
             scores = np.clip(scores, 0, 1)
             sem_map = {did: float(s) for did, s in zip(doc_ids, scores)}
 
-            # 2. Lexikalische Suche (FTS)
+            # 2. Lexical Search (FTS)
             words = query.replace('"', '').split()
             if not words: words = [query]
             fts_query = " OR ".join([f'"{w}"*' for w in words])
@@ -88,29 +137,29 @@ class DatabaseHandler:
             try:
                 fts_rows = cursor.execute("SELECT rowid, filename, content FROM documents WHERE documents MATCH ? LIMIT 100", (fts_query,)).fetchall()
             except Exception as e:
-                print(f"FTS Fehler (ignoriert): {e}")
+                print(f"FTS Error (ignored): {e}")
                 fts_rows = []
 
             lex_map = {}
             for did, fname, content in fts_rows:
                 r1 = fuzz.partial_ratio(query.lower(), fname.lower())
-                # Content kürzen für Performance
+                # Truncate content for performance
                 r2 = fuzz.partial_token_set_ratio(query.lower(), content[:5000].lower())
                 lex_map[did] = max(r1, r2) / 100.0
 
             # 3. Hybrid Fusion
             final = {}
-            ALPHA = 0.65
-            BETA = 0.35
+            ALPHA = 0.65  # Weight for semantic score
+            BETA = 0.35   # Weight for lexical score
             for did, s_score in sem_map.items():
                 if s_score < 0.15 and did not in lex_map: continue
                 l_score = lex_map.get(did, 0.0)
                 h_score = (s_score * ALPHA) + (l_score * BETA)
-                # Kleiner Boost wenn beides passt
+                # Small boost if both scores are good
                 if s_score > 0.4 and l_score > 0.6: h_score += 0.1
                 final[did] = h_score
 
-            # 4. Ergebnisse holen
+            # 4. Fetch Results
             sorted_ids = sorted(final.keys(), key=lambda x: final[x], reverse=True)[:50]
             results = []
             for did in sorted_ids:
@@ -121,8 +170,8 @@ class DatabaseHandler:
             return results
 
         except Exception as e:
-            # DIESER TEIL IST NEU: Er schreibt den Fehler ins Logfile
-            print(f"!!! KRITISCHER FEHLER IN DER SUCHE !!!")
-            print(f"Fehler: {e}")
+            # Print the error and full traceback; stdout is redirected to the log file by config.py
+            print(f"!!! CRITICAL ERROR IN SEARCH !!!")
+            print(f"Error: {e}")
             print(traceback.format_exc())
             return []
\ No newline at end of file
diff --git a/indexer.py b/indexer.py
index 0329648..ece4b80 100644
--- a/indexer.py
+++ b/indexer.py
@@ -6,7 +6,7 @@ import zipfile
 import io
 from PyQt6.QtCore import QThread, pyqtSignal
 
-# Importe optionaler Libraries
+# Optional library imports
 try: import docx
 except ImportError: docx = None
 try: import openpyxl
@@ -15,19 +15,43 @@ try: from pptx import Presentation
 except ImportError: Presentation = None
 
 class IndexerThread(QThread):
+    """
+    A QThread that indexes files in a given folder, extracts their text content,
+    and stores it in a database along with semantic embeddings.
+    """
     progress_signal = pyqtSignal(str)
     finished_signal = pyqtSignal(int, int, bool)
 
     def __init__(self, folder, db_name, model):
+        """
+        Initializes the IndexerThread.
+
+        Args:
+            folder (str): The path to the folder to be indexed.
+            db_name (str): The name of the SQLite database file.
+            model: The sentence-transformer model for creating embeddings.
+        """
         super().__init__()
         self.folder_path = folder
         self.db_name = db_name
         self.model = model
         self.is_running = True
 
-    def stop(self): self.is_running = False
+    def stop(self):
+        """Stops the indexing process."""
+        self.is_running = False
 
     def _extract_text(self, stream, filename):
+        """
+        Extracts text from a file stream based on its extension.
+
+        Args:
+            stream (io.BytesIO): The file stream to read from.
+            filename (str): The name of the file.
+
+        Returns:
+            str: The extracted text content.
+        """
         ext = os.path.splitext(filename)[1].lower()
         text = ""
         try:
@@ -36,13 +60,15 @@ class IndexerThread(QThread):
                     with pdfplumber.open(stream) as pdf:
                         for p in pdf.pages:
                             if t := p.extract_text(): text += t + "\n"
-                except: pass
+                except Exception:
+                    pass
             
             elif ext == ".docx" and docx:
                 try:
                     doc = docx.Document(stream)
                     for para in doc.paragraphs: text += para.text + "\n"
-                except: pass
+                except Exception:
+                    pass
 
             elif ext == ".xlsx" and openpyxl:
                 try:
@@ -52,39 +78,50 @@ class IndexerThread(QThread):
                         for row in sheet.iter_rows(values_only=True):
                             row_text = " ".join([str(c) for c in row if c is not None])
                             if row_text.strip(): text += row_text + "\n"
-                except: pass
+                except Exception:
+                    pass
 
             elif ext == ".pptx" and Presentation:
                 try:
                     prs = Presentation(stream)
                     for i, slide in enumerate(prs.slides):
-                        text += f"\n--- Folie {i+1} ---\n"
+                        text += f"\n--- Slide {i+1} ---\n"
                         for shape in slide.shapes:
                             if shape.has_text_frame:
                                 for p in shape.text_frame.paragraphs:
                                     for r in p.runs: text += r.text + " "
                                     text += "\n"
-                except: pass
+                except Exception:
+                    pass
 
             elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
                 try:
                     content = stream.read()
                     if isinstance(content, str): text = content
                     else: text = content.decode('utf-8', errors='ignore')
-                except: pass
-        except: pass
+                except Exception:
+                    pass
+        except Exception:
+            pass
         return text
 
     def run(self):
+        """
+        Starts the indexing process.
+        
+        Iterates through files in the specified folder, extracts text,
+        and saves it to the database. Emits progress and finished signals.
+        """
         conn = sqlite3.connect(self.db_name)
         cursor = conn.cursor()
         
-        # Cleanup old entries
+        # Cleanup old entries for the folder
         cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
         ids = [r[0] for r in cursor.fetchall()]
         if ids:
             cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
-            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids)
+            placeholders = ','.join('?' * len(ids))
+            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids)
             conn.commit()
 
         indexed = 0
@@ -92,11 +129,15 @@ class IndexerThread(QThread):
         cancelled = False
 
         for root, dirs, files in os.walk(self.folder_path):
-            if not self.is_running: cancelled = True; break
+            if not self.is_running:
+                cancelled = True
+                break
             for file in files:
-                if not self.is_running: cancelled = True; break
+                if not self.is_running:
+                    cancelled = True
+                    break
                 path = os.path.join(root, file)
-                self.progress_signal.emit(f"Prüfe: {file}...")
+                self.progress_signal.emit(f"Checking: {file}...")
 
                 if file.lower().endswith('.zip'):
                     try:
@@ -109,7 +150,8 @@ class IndexerThread(QThread):
                                     if content and len(content.strip()) > 20:
                                         self._save(cursor, zi.filename, vpath, content)
                                         indexed += 1
-                    except: skipped += 1
+                    except Exception:
+                        skipped += 1
                 else:
                     try:
                         with open(path, "rb") as f:
@@ -118,17 +160,30 @@ class IndexerThread(QThread):
                         if content and len(content.strip()) > 20:
                             self._save(cursor, file, path, content)
                             indexed += 1
-                        else: skipped += 1
-                    except: skipped += 1
+                        else:
+                            skipped += 1
+                    except Exception:
+                        skipped += 1
 
-            if cancelled: break
+            if cancelled:
+                break
         
         conn.commit()
         conn.close()
         self.finished_signal.emit(indexed, skipped, cancelled)
 
     def _save(self, cursor, fname, path, content):
+        """
+        Saves the extracted content and its embedding to the database.
+
+        Args:
+            cursor: The database cursor.
+            fname (str): The name of the file.
+            path (str): The full path to the file.
+            content (str): The extracted text content.
+        """
         cursor.execute("INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)", (fname, path, content))
         did = cursor.lastrowid
+        # Truncate content for embedding to avoid excessive memory usage
         vec = self.model.encode(content[:8000], convert_to_tensor=False).tobytes()
         cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (did, vec))
\ No newline at end of file
diff --git a/main.py b/main.py
index a2f2c3e..b3cf799 100644
--- a/main.py
+++ b/main.py
@@ -1,38 +1,36 @@
 # main.py
 import sys
 import os
+
+from config import qt_message_handler, LOG_FILE
+
 from PyQt6.QtWidgets import QApplication, QSplashScreen
 from PyQt6.QtGui import QPixmap, QFont
 from PyQt6.QtCore import qInstallMessageHandler
 
-from config import Logger, qt_message_handler, LOG_FILE
 from ui import UffWindow
 
-# 1. Logging Setup
-sys.stdout = Logger()
-sys.stderr = sys.stdout
-print(f"--- APP START ---")
-print(f"Logfile: {LOG_FILE}")
-
-# 2. Filter für Qt Meldungen installieren
 qInstallMessageHandler(qt_message_handler)
 os.environ["QT_LOGGING_RULES"] = "qt.text.font.db=false;qt.qpa.fonts=false"
 
 if __name__ == "__main__":
-    app = QApplication(sys.argv)
-    
-    # Globale Schriftart
-    app.setFont(QFont("Segoe UI", 10))
+    try:
+        app = QApplication(sys.argv)
+        app.setFont(QFont("Segoe UI", 10))
 
-    splash = None
-    if os.path.exists("assets/uff_banner.jpeg"):
-        try:
-            splash = QSplashScreen(QPixmap("assets/uff_banner.jpeg"))
-            splash.show()
-        except: pass
+        splash = None
+        if os.path.exists("assets/uff_banner.jpeg"):
+            try:
+                splash = QSplashScreen(QPixmap("assets/uff_banner.jpeg"))
+                splash.show()
+            except: pass
 
-    window = UffWindow(splash)
-    window.show()
-    window.start_model_loading()
-    
-    sys.exit(app.exec())
\ No newline at end of file
+        window = UffWindow(splash)
+        window.show()
+        window.start_model_loading()
+        
+        sys.exit(app.exec())
+    except Exception as e:
+        import traceback
+        print("CRITICAL MAIN CRASH:")
+        print(traceback.format_exc())
\ No newline at end of file