# Source listing metadata (from file-hosting export):
# Files — retrieved 2026-01-10 13:23:01 +01:00 — 189 lines, 7.0 KiB, Python
# indexer.py
import os
import sqlite3
import pdfplumber
import zipfile
import io
from PyQt6.QtCore import QThread, pyqtSignal
# Optional library imports
try: import docx
except ImportError: docx = None
try: import openpyxl
except ImportError: openpyxl = None
try: from pptx import Presentation
except ImportError: Presentation = None
class IndexerThread(QThread):
    """
    A QThread that indexes files in a given folder, extracts their text
    content, and stores it in a SQLite database along with semantic
    embeddings produced by a sentence-transformer model.

    Signals:
        progress_signal (str): Human-readable status for the file being checked.
        finished_signal (int, int, bool): (indexed_count, skipped_count, cancelled).
    """
    progress_signal = pyqtSignal(str)
    finished_signal = pyqtSignal(int, int, bool)

    # Plain-text extensions that are read and decoded directly,
    # without a format-specific parser.
    _TEXT_EXTENSIONS = (".txt", ".md", ".py", ".json", ".csv",
                        ".html", ".log", ".ini", ".xml")

    def __init__(self, folder, db_name, model):
        """
        Initializes the IndexerThread.

        Args:
            folder (str): The path to the folder to be indexed.
            db_name (str): The name of the SQLite database file.
            model: The sentence-transformer model for creating embeddings.
        """
        super().__init__()
        self.folder_path = folder
        self.db_name = db_name
        self.model = model
        self.is_running = True

    def stop(self):
        """Requests a cooperative stop; run() checks this flag between files."""
        self.is_running = False

    def _extract_text(self, stream, filename):
        """
        Extracts text from a file stream based on its extension.

        Extraction is best-effort: any parser failure yields whatever text was
        collected up to that point (possibly the empty string), never an
        exception.

        Args:
            stream (io.BytesIO): The file stream to read from.
            filename (str): The name of the file (used only for its extension).

        Returns:
            str: The extracted text content ("" for unsupported/unreadable files).
        """
        ext = os.path.splitext(filename)[1].lower()
        try:
            if ext == ".pdf":
                return self._read_pdf(stream)
            if ext == ".docx" and docx:
                return self._read_docx(stream)
            if ext == ".xlsx" and openpyxl:
                return self._read_xlsx(stream)
            if ext == ".pptx" and Presentation:
                return self._read_pptx(stream)
            if ext in self._TEXT_EXTENSIONS:
                return self._read_plain(stream)
        except Exception:
            # Defensive catch-all: extraction must never abort indexing.
            pass
        return ""

    def _read_pdf(self, stream):
        """Extract text from a PDF page by page (best effort, partial on error)."""
        parts = []
        try:
            with pdfplumber.open(stream) as pdf:
                for page in pdf.pages:
                    if page_text := page.extract_text():
                        parts.append(page_text + "\n")
        except Exception:
            pass
        return "".join(parts)

    def _read_docx(self, stream):
        """Extract paragraph text from a .docx document (best effort)."""
        parts = []
        try:
            document = docx.Document(stream)
            for para in document.paragraphs:
                parts.append(para.text + "\n")
        except Exception:
            pass
        return "".join(parts)

    def _read_xlsx(self, stream):
        """Extract cell text from a .xlsx workbook, one section per sheet."""
        parts = []
        try:
            wb = openpyxl.load_workbook(stream, data_only=True, read_only=True)
            for sheet in wb.worksheets:
                parts.append(f"\n--- {sheet.title} ---\n")
                for row in sheet.iter_rows(values_only=True):
                    row_text = " ".join(str(c) for c in row if c is not None)
                    if row_text.strip():
                        parts.append(row_text + "\n")
        except Exception:
            pass
        return "".join(parts)

    def _read_pptx(self, stream):
        """Extract run text from a .pptx presentation, one section per slide."""
        parts = []
        try:
            prs = Presentation(stream)
            for i, slide in enumerate(prs.slides):
                parts.append(f"\n--- Slide {i+1} ---\n")
                for shape in slide.shapes:
                    if shape.has_text_frame:
                        for paragraph in shape.text_frame.paragraphs:
                            for run in paragraph.runs:
                                parts.append(run.text + " ")
                        parts.append("\n")
        except Exception:
            pass
        return "".join(parts)

    def _read_plain(self, stream):
        """Read a plain-text stream, decoding bytes as lenient UTF-8."""
        try:
            content = stream.read()
            if isinstance(content, str):
                return content
            return content.decode('utf-8', errors='ignore')
        except Exception:
            return ""

    def _purge_previous(self, cursor):
        """
        Deletes prior index rows (documents + embeddings) for this folder.

        Escapes SQL LIKE wildcards so folder paths containing '%' or '_'
        match literally instead of as patterns.
        """
        escaped = (self.folder_path
                   .replace("\\", "\\\\")
                   .replace("%", "\\%")
                   .replace("_", "\\_"))
        pattern = escaped + "%"
        cursor.execute(
            "SELECT rowid FROM documents WHERE path LIKE ? ESCAPE '\\'",
            (pattern,))
        doc_ids = [row[0] for row in cursor.fetchall()]
        if doc_ids:
            cursor.execute(
                "DELETE FROM documents WHERE path LIKE ? ESCAPE '\\'",
                (pattern,))
            placeholders = ','.join('?' * len(doc_ids))
            cursor.execute(
                f"DELETE FROM embeddings WHERE doc_id IN ({placeholders})",
                doc_ids)

    def _index_zip(self, cursor, path):
        """
        Indexes every readable file inside a zip archive.

        Returns:
            tuple[int, int]: (indexed, skipped) counts. A corrupt/unreadable
            archive adds one to skipped; partial progress is preserved.
        """
        indexed = 0
        skipped = 0
        try:
            with zipfile.ZipFile(path, 'r') as z:
                for zi in z.infolist():
                    if not self.is_running:
                        break  # honor cancellation inside large archives
                    if zi.is_dir():
                        continue
                    vpath = f"{path} :: {zi.filename}"
                    with z.open(zi) as zf:
                        content = self._extract_text(io.BytesIO(zf.read()),
                                                     zi.filename)
                    if content and len(content.strip()) > 20:
                        self._save(cursor, zi.filename, vpath, content)
                        indexed += 1
                    else:
                        # Count unusable entries, consistent with regular files.
                        skipped += 1
        except Exception:
            skipped += 1
        return indexed, skipped

    def run(self):
        """
        Starts the indexing process.

        Purges previous entries for the folder, then walks it, extracting text
        from each file (zip archives are indexed entry-by-entry) and saving
        content plus embeddings to the database. Emits progress and finished
        signals. Files with no/too-little text (<= 20 chars) are skipped.
        """
        conn = sqlite3.connect(self.db_name)
        indexed = 0
        skipped = 0
        cancelled = False
        try:
            cursor = conn.cursor()
            self._purge_previous(cursor)
            conn.commit()
            for root, _dirs, files in os.walk(self.folder_path):
                if not self.is_running:
                    cancelled = True
                    break
                for name in files:
                    if not self.is_running:
                        cancelled = True
                        break
                    path = os.path.join(root, name)
                    self.progress_signal.emit(f"Checking: {name}...")
                    if name.lower().endswith('.zip'):
                        zip_indexed, zip_skipped = self._index_zip(cursor, path)
                        indexed += zip_indexed
                        skipped += zip_skipped
                    else:
                        try:
                            with open(path, "rb") as fh:
                                buffer = io.BytesIO(fh.read())
                            content = self._extract_text(buffer, name)
                            if content and len(content.strip()) > 20:
                                self._save(cursor, name, path, content)
                                indexed += 1
                            else:
                                skipped += 1
                        except Exception:
                            skipped += 1
                if cancelled:
                    break
            conn.commit()
        finally:
            # Always release the connection, even if indexing raised.
            conn.close()
        self.finished_signal.emit(indexed, skipped, cancelled)

    def _save(self, cursor, fname, path, content):
        """
        Saves the extracted content and its embedding to the database.

        Args:
            cursor: The database cursor.
            fname (str): The name of the file.
            path (str): The full path to the file (may be a virtual
                "archive :: member" path for zip entries).
            content (str): The extracted text content.
        """
        cursor.execute(
            "INSERT INTO documents (filename, path, content) VALUES (?, ?, ?)",
            (fname, path, content))
        did = cursor.lastrowid
        # Truncate content for embedding to avoid excessive memory usage.
        vec = self.model.encode(content[:8000], convert_to_tensor=False).tobytes()
        cursor.execute(
            "INSERT INTO embeddings (doc_id, vec) VALUES (?, ?)", (did, vec))