add some docstring
This commit is contained in:
91
indexer.py
91
indexer.py
@@ -6,7 +6,7 @@ import zipfile
|
||||
import io
|
||||
from PyQt6.QtCore import QThread, pyqtSignal
|
||||
|
||||
# Importe optionaler Libraries
|
||||
# Optional library imports
|
||||
try: import docx
|
||||
except ImportError: docx = None
|
||||
try: import openpyxl
|
||||
@@ -15,19 +15,43 @@ try: from pptx import Presentation
|
||||
except ImportError: Presentation = None
|
||||
|
||||
class IndexerThread(QThread):
|
||||
"""
|
||||
A QThread that indexes files in a given folder, extracts their text content,
|
||||
and stores it in a database along with semantic embeddings.
|
||||
"""
|
||||
progress_signal = pyqtSignal(str)
|
||||
finished_signal = pyqtSignal(int, int, bool)
|
||||
|
||||
def __init__(self, folder, db_name, model):
    """
    Initializes the IndexerThread.

    Args:
        folder (str): The path to the folder to be indexed.
        db_name (str): The name of the SQLite database file.
        model: The sentence-transformer model for creating embeddings;
            its ``encode`` method is called for every saved document.
    """
    super().__init__()
    self.folder_path = folder
    self.db_name = db_name
    self.model = model
    # Cooperative cancellation flag: stop() sets it to False and run()
    # checks it before each directory and each file.
    self.is_running = True
|
||||
|
||||
def stop(self):
    """Stops the indexing process.

    Only clears the ``is_running`` flag; ``run`` checks that flag before
    each directory and each file and then leaves its loops.
    """
    self.is_running = False
|
||||
|
||||
def _extract_text(self, stream, filename):
|
||||
"""
|
||||
Extracts text from a file stream based on its extension.
|
||||
|
||||
Args:
|
||||
stream (io.BytesIO): The file stream to read from.
|
||||
filename (str): The name of the file.
|
||||
|
||||
Returns:
|
||||
str: The extracted text content.
|
||||
"""
|
||||
ext = os.path.splitext(filename)[1].lower()
|
||||
text = ""
|
||||
try:
|
||||
@@ -36,13 +60,15 @@ class IndexerThread(QThread):
|
||||
with pdfplumber.open(stream) as pdf:
|
||||
for p in pdf.pages:
|
||||
if t := p.extract_text(): text += t + "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext == ".docx" and docx:
|
||||
try:
|
||||
doc = docx.Document(stream)
|
||||
for para in doc.paragraphs: text += para.text + "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext == ".xlsx" and openpyxl:
|
||||
try:
|
||||
@@ -52,39 +78,50 @@ class IndexerThread(QThread):
|
||||
for row in sheet.iter_rows(values_only=True):
|
||||
row_text = " ".join([str(c) for c in row if c is not None])
|
||||
if row_text.strip(): text += row_text + "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext == ".pptx" and Presentation:
|
||||
try:
|
||||
prs = Presentation(stream)
|
||||
for i, slide in enumerate(prs.slides):
|
||||
text += f"\n--- Folie {i+1} ---\n"
|
||||
text += f"\n--- Slide {i+1} ---\n"
|
||||
for shape in slide.shapes:
|
||||
if shape.has_text_frame:
|
||||
for p in shape.text_frame.paragraphs:
|
||||
for r in p.runs: text += r.text + " "
|
||||
text += "\n"
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
elif ext in [".txt", ".md", ".py", ".json", ".csv", ".html", ".log", ".ini", ".xml"]:
|
||||
try:
|
||||
content = stream.read()
|
||||
if isinstance(content, str): text = content
|
||||
else: text = content.decode('utf-8', errors='ignore')
|
||||
except: pass
|
||||
except: pass
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
return text
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Starts the indexing process.
|
||||
|
||||
Iterates through files in the specified folder, extracts text,
|
||||
and saves it to the database. Emits progress and finished signals.
|
||||
"""
|
||||
conn = sqlite3.connect(self.db_name)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Cleanup old entries
|
||||
# Cleanup old entries for the folder
|
||||
cursor.execute("SELECT rowid FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||
ids = [r[0] for r in cursor.fetchall()]
|
||||
if ids:
|
||||
cursor.execute("DELETE FROM documents WHERE path LIKE ?", (f"{self.folder_path}%",))
|
||||
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({','.join('?'*len(ids))})", ids)
|
||||
placeholders = ','.join('?' * len(ids))
|
||||
cursor.execute(f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})", ids)
|
||||
conn.commit()
|
||||
|
||||
indexed = 0
|
||||
@@ -92,11 +129,15 @@ class IndexerThread(QThread):
|
||||
cancelled = False
|
||||
|
||||
for root, dirs, files in os.walk(self.folder_path):
|
||||
if not self.is_running: cancelled = True; break
|
||||
if not self.is_running:
|
||||
cancelled = True
|
||||
break
|
||||
for file in files:
|
||||
if not self.is_running: cancelled = True; break
|
||||
if not self.is_running:
|
||||
cancelled = True
|
||||
break
|
||||
path = os.path.join(root, file)
|
||||
self.progress_signal.emit(f"Prüfe: {file}...")
|
||||
self.progress_signal.emit(f"Checking: {file}...")
|
||||
|
||||
if file.lower().endswith('.zip'):
|
||||
try:
|
||||
@@ -109,7 +150,8 @@ class IndexerThread(QThread):
|
||||
if content and len(content.strip()) > 20:
|
||||
self._save(cursor, zi.filename, vpath, content)
|
||||
indexed += 1
|
||||
except: skipped += 1
|
||||
except Exception:
|
||||
skipped += 1
|
||||
else:
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
@@ -118,17 +160,30 @@ class IndexerThread(QThread):
|
||||
if content and len(content.strip()) > 20:
|
||||
self._save(cursor, file, path, content)
|
||||
indexed += 1
|
||||
else: skipped += 1
|
||||
except: skipped += 1
|
||||
else:
|
||||
skipped += 1
|
||||
except Exception:
|
||||
skipped += 1
|
||||
|
||||
if cancelled: break
|
||||
if cancelled:
|
||||
break
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
self.finished_signal.emit(indexed, skipped, cancelled)
|
||||
|
||||
def _save(self, cursor, fname, path, content):
    """
    Saves the extracted content and its embedding to the database.

    Inserts one row into ``documents`` and one row into ``embeddings``
    that refers to it via ``doc_id``. Nothing is committed here; that is
    left to the caller that owns the connection.

    Args:
        cursor: The database cursor.
        fname (str): The name of the file.
        path (str): The full path to the file.
        content (str): The extracted text content.
    """
    cursor.execute(
        "INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)",
        (fname, path, content),
    )
    # Row id of the document just inserted; links the embedding to it.
    doc_id = cursor.lastrowid
    # Truncate content for embedding to avoid excessive memory usage
    embedding = self.model.encode(content[:8000], convert_to_tensor=False)
    cursor.execute(
        "INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)",
        (doc_id, embedding.tobytes()),
    )
|
||||
Reference in New Issue
Block a user