Add docstrings, translate German comments and messages to English, and replace bare excepts

2026-01-10 13:23:01 +01:00
parent 17b506db77
commit 1855810c14
5 changed files with 208 additions and 67 deletions
--- a/indexer.py
+++ b/indexer.py
@@ -6,7 +6,7 @@ import zipfile
 import io
 from PyQt6.QtCore import QThread, pyqtSignal

-# Importe optionaler Libraries
+# Optional library imports
 try: import docx
 except ImportError: docx = None
 try: import openpyxl
@@ -15,19 +15,43 @@ try: from pptx import Presentation
 except ImportError: Presentation = None

 class IndexerThread(QThread):
+    """
+    A QThread that indexes files in a given folder, extracts their text content,
+    and stores it in a database along with semantic embeddings.
+    """
    progress_signal = pyqtSignal(str)
    finished_signal = pyqtSignal(int, int, bool)

    def __init__(self, folder, db_name, model):
+        """
+        Initializes the IndexerThread.
+
+        Args:
+            folder (str): The path to the folder to be indexed.
+            db_name (str): The name of the SQLite database file.
+            model: The sentence-transformer model for creating embeddings.
+        """
        super().__init__()
        self.folder_path = folder
        self.db_name = db_name
        self.model = model
        self.is_running = True

-    def stop(self): self.is_running = False
+    def stop(self):
+        """Stops the indexing process."""
+        self.is_running = False

    def _extract_text(self, stream, filename):
+        """
+        Extracts text from a file stream based on its extension.
+
+        Args:
+            stream (io.BytesIO): The file stream to read from.
+            filename (str): The name of the file.
+
+        Returns:
+            str: The extracted text content.
+        """
        ext = os.path.splitext(filename)[1].lower()
        text = ""
        try:
@@ -36,13 +60,15 @@ class IndexerThread(QThread):
                    with pdfplumber.open(stream) as pdf:
                        for p in pdf.pages:
                            if t := p.extract_text(): text += t + "\n"
-                except: pass
+                except Exception:
+                    pass
            
            elif ext == ".docx" and docx:
                try:
                    doc = docx.Document(stream)
                    for para in doc.paragraphs: text += para.text + "\n"
-                except: pass
+                except Exception:
+                    pass

            elif ext == ".xlsx" and openpyxl:
                try:
@@ -52,39 +78,50 @@ class IndexerThread(QThread):
                        for row in sheet.iter_rows(values_only=True):
                            row_text = " ".join([str(c) for c in row if c is not None])
                            if row_text.strip(): text += row_text + "\n"
-                except: pass
+                except Exception:
+                    pass

            elif ext == ".pptx" and Presentation:
                try:
                    prs = Presentation(stream)
                    for i, slide in enumerate(prs.slides):
-                        text += f"\n--- Folie {i+1} ---\n"
+                        text += f"\n--- Slide {i+1} ---\n"
                        for shape in slide.shapes:
                            if shape.has_text_frame:
                                for p in shape.text_frame.paragraphs:
                                    for r in p.runs: text += r.text + " "
                                    text += "\n"
-                except: pass
+                except Exception:
+                    pass

            elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
                try:
                    content = stream.read()
                    if isinstance(content, str): text = content
                    else: text = content.decode('utf-8', errors='ignore')
-                except: pass
-        except: pass
+                except Exception:
+                    pass
+        except Exception:
+            pass
        return text

    def run(self):
+        """
+        Starts the indexing process.
+        
+        Iterates through files in the specified folder, extracts text,
+        and saves it to the database. Emits progress and finished signals.
+        """
        conn = sqlite3.connect(self.db_name)
        cursor = conn.cursor()
        
-        # Cleanup old entries
+        # Cleanup old entries for the folder
        cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
        ids = [r[0] for r in cursor.fetchall()]
        if ids:
            cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
-            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids)
+            placeholders = ','.join('?' * len(ids))
+            cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids)
            conn.commit()

        indexed = 0
@@ -92,11 +129,15 @@ class IndexerThread(QThread):
        cancelled = False

        for root, dirs, files in os.walk(self.folder_path):
-            if not self.is_running: cancelled = True; break
+            if not self.is_running:
+                cancelled = True
+                break
            for file in files:
-                if not self.is_running: cancelled = True; break
+                if not self.is_running:
+                    cancelled = True
+                    break
                path = os.path.join(root, file)
-                self.progress_signal.emit(f"Prüfe: {file}...")
+                self.progress_signal.emit(f"Checking: {file}...")

                if file.lower().endswith('.zip'):
                    try:
@@ -109,7 +150,8 @@ class IndexerThread(QThread):
                                    if content and len(content.strip()) > 20:
                                        self._save(cursor, zi.filename, vpath, content)
                                        indexed += 1
-                    except: skipped += 1
+                    except Exception:
+                        skipped += 1
                else:
                    try:
                        with open(path, "rb") as f:
@@ -118,17 +160,30 @@ class IndexerThread(QThread):
                        if content and len(content.strip()) > 20:
                            self._save(cursor, file, path, content)
                            indexed += 1
-                        else: skipped += 1
-                    except: skipped += 1
+                        else:
+                            skipped += 1
+                    except Exception:
+                        skipped += 1

-            if cancelled: break
+            if cancelled:
+                break
        
        conn.commit()
        conn.close()
        self.finished_signal.emit(indexed, skipped, cancelled)

    def _save(self, cursor, fname, path, content):
+        """
+        Saves the extracted content and its embedding to the database.
+
+        Args:
+            cursor: The database cursor.
+            fname (str): The name of the file.
+            path (str): The full path to the file.
+            content (str): The extracted text content.
+        """
        cursor.execute("INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)", (fname, path, content))
        did = cursor.lastrowid
+        # Truncate content for embedding to avoid excessive memory usage
        vec = self.model.encode(content[:8000], convert_to_tensor=False).tobytes()
        cursor.execute("INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (did, vec))