Chapter 9 of 16
6. PRACTICAL ENGINEERING SKILLS
(This section is about making your system actually work in production. Everything before this assumed clean data, perfect uptime, and users who ask well-formed questions. None of that is true. Real documents are messy PDFs with broken encoding. Real users ask ambiguous questions. Real systems crash at 3am. The code below handles these realities.)
Document Processing Pipeline
End-to-End Pipeline
class DocumentProcessor:
    """End-to-end document ingestion pipeline.

    Extracts raw text from several file formats, normalizes it, chunks it,
    and wraps each chunk in a dict carrying metadata and an embedding.
    """

    def __init__(self):
        # Extensions this pipeline knows how to extract text from.
        self.supported_formats = ['.pdf', '.docx', '.txt', '.md', '.html']

    def process_document(self, file_path):
        """Run the complete pipeline on one file.

        Args:
            file_path: path to the document on disk.

        Returns:
            A list of chunk dicts (see create_chunk_objects).
        """
        # Step 1: Extract text
        raw_text = self.extract_text(file_path)
        # Step 2: Clean text
        cleaned_text = self.clean_text(raw_text)
        # Step 3: Extract metadata (defined elsewhere in this module/class)
        metadata = self.extract_metadata(file_path, cleaned_text)
        # Step 4: Chunk text
        chunks = self.chunk_text(cleaned_text)
        # Step 5: Generate embeddings and assemble chunk objects
        return self.create_chunk_objects(chunks, metadata)

    def extract_text(self, file_path):
        """Dispatch to a format-specific extractor based on file extension.

        Raises:
            ValueError: if the extension is not a supported format.
        """
        ext = Path(file_path).suffix.lower()
        if ext == '.pdf':
            return self.extract_from_pdf(file_path)
        elif ext == '.docx':
            return self.extract_from_docx(file_path)
        elif ext in ['.txt', '.md']:
            return Path(file_path).read_text(encoding='utf-8')
        elif ext == '.html':
            return self.extract_from_html(file_path)
        # BUG FIX: previously fell through and returned None silently for
        # unknown extensions, which made clean_text blow up later with a
        # confusing TypeError far from the actual cause. Fail fast instead.
        raise ValueError(
            f"Unsupported file format: {ext!r} "
            f"(supported: {self.supported_formats})"
        )

    def extract_from_pdf(self, file_path):
        """Extract and concatenate the text of every page of a PDF."""
        import PyPDF2
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # join() avoids quadratic += string concatenation on large PDFs.
            return "".join(page.extract_text() for page in pdf_reader.pages)

    def clean_text(self, text):
        """Normalize extracted text.

        De-hyphenates words broken across lines, drops standalone page
        numbers, then collapses all whitespace runs into single spaces.
        """
        import re
        # Fix words broken across line ends ("docu-\nment" -> "document").
        text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
        # Remove standalone page numbers (simple heuristic).
        # BUG FIX: this used to run AFTER the \s+ collapse below, by which
        # point no newlines remained in the text, so the pattern could
        # never match. Newline-dependent passes must run first.
        text = re.sub(r'\n\d+\n', '\n', text)
        # Collapse every run of whitespace (incl. newlines) to one space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text, chunk_size=1000, overlap=200):
        """Split text into overlapping chunks.

        The separator cascade prefers paragraph, then line, then sentence
        boundaries before falling back to arbitrary character splits.
        """
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        return splitter.split_text(text)

    def create_chunk_objects(self, chunks, metadata):
        """Attach metadata and an embedding to each chunk.

        NOTE(review): relies on a module-level get_embedding() and on
        metadata containing 'file_name' — both defined elsewhere; confirm
        against extract_metadata.
        """
        chunk_objects = []
        for i, chunk in enumerate(chunks):
            chunk_objects.append({
                # Stable, human-readable id: "<file>_<chunk index>".
                "id": f"{metadata['file_name']}_{i}",
                "content": chunk,
                "metadata": {
                    **metadata,
                    "chunk_index": i,
                    "char_count": len(chunk)
                },
                "embedding": get_embedding(chunk)
            })
        return chunk_objects
Handling Different File Types
def process_code_files(file_path, chunk_size=500, chunk_overlap=50):
    """Split a source-code file into syntax-aware chunks.

    Uses LangChain's language-specific splitter so chunk boundaries tend
    to fall on function/class boundaries rather than mid-statement.

    Args:
        file_path: path to the code file; language inferred from extension.
        chunk_size: max characters per chunk (default 500, as before).
        chunk_overlap: characters shared between consecutive chunks.

    Returns:
        A list of chunk strings, or None when the extension is not one of
        the mapped languages (the original silent-None contract is kept
        so existing callers that check for None still work).
    """
    from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

    # Extension -> LangChain language enum. Extend here to support more.
    language_map = {
        '.py': Language.PYTHON,
        '.js': Language.JS,
        '.java': Language.JAVA,
    }
    language = language_map.get(Path(file_path).suffix)
    if language is None:
        return None

    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # BUG FIX: read with explicit UTF-8 instead of the platform default
    # encoding, which fails on Windows (cp1252) for non-ASCII source files.
    code = Path(file_path).read_text(encoding='utf-8')
    return splitter.split_text(code)