Chapter 9 of 16
6. PRACTICAL ENGINEERING SKILLS
(This section is about making your system actually work in production. Everything before this assumed clean data, perfect uptime, and users who ask well-formed questions. None of that is true. Real documents are messy PDFs with broken encoding. Real users ask ambiguous questions. Real systems crash at 3am. The code below handles these realities.)
Document Processing Pipeline
End-to-End Pipeline
class DocumentProcessor:
    """End-to-end document ingestion pipeline.

    Extracts raw text from several file formats, normalizes it, chunks it,
    and wraps each chunk in a dict carrying metadata and an embedding.
    """

    def __init__(self):
        # Extensions this pipeline knows how to extract text from.
        self.supported_formats = ['.pdf', '.docx', '.txt', '.md', '.html']

    def process_document(self, file_path):
        """Run the complete pipeline on one file.

        Args:
            file_path: path to the document on disk.

        Returns:
            A list of chunk dicts (see create_chunk_objects).
        """
        # Step 1: Extract text
        raw_text = self.extract_text(file_path)
        # Step 2: Clean text
        cleaned_text = self.clean_text(raw_text)
        # Step 3: Extract metadata (defined elsewhere in this module/class)
        metadata = self.extract_metadata(file_path, cleaned_text)
        # Step 4: Chunk text
        chunks = self.chunk_text(cleaned_text)
        # Step 5: Generate embeddings and assemble chunk objects
        return self.create_chunk_objects(chunks, metadata)

    def extract_text(self, file_path):
        """Dispatch to a format-specific extractor based on file extension.

        Raises:
            ValueError: if the extension is not a supported format.
        """
        ext = Path(file_path).suffix.lower()
        if ext == '.pdf':
            return self.extract_from_pdf(file_path)
        elif ext == '.docx':
            return self.extract_from_docx(file_path)
        elif ext in ['.txt', '.md']:
            return Path(file_path).read_text(encoding='utf-8')
        elif ext == '.html':
            return self.extract_from_html(file_path)
        # BUG FIX: previously fell through and returned None silently for
        # unknown extensions, which made clean_text blow up later with a
        # confusing TypeError far from the actual cause. Fail fast instead.
        raise ValueError(
            f"Unsupported file format: {ext!r} "
            f"(supported: {self.supported_formats})"
        )

    def extract_from_pdf(self, file_path):
        """Extract and concatenate the text of every page of a PDF."""
        import PyPDF2
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # join() avoids quadratic += string concatenation on large PDFs.
            return "".join(page.extract_text() for page in pdf_reader.pages)

    def clean_text(self, text):
        """Normalize extracted text.

        De-hyphenates words broken across lines, drops standalone page
        numbers, then collapses all whitespace runs into single spaces.
        """
        import re
        # Fix words broken across line ends ("docu-\nment" -> "document").
        text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
        # Remove standalone page numbers (simple heuristic).
        # BUG FIX: this used to run AFTER the \s+ collapse below, by which
        # point no newlines remained in the text, so the pattern could
        # never match. Newline-dependent passes must run first.
        text = re.sub(r'\n\d+\n', '\n', text)
        # Collapse every run of whitespace (incl. newlines) to one space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text, chunk_size=1000, overlap=200):
        """Split text into overlapping chunks.

        The separator cascade prefers paragraph, then line, then sentence
        boundaries before falling back to arbitrary character splits.
        """
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        return splitter.split_text(text)

    def create_chunk_objects(self, chunks, metadata):
        """Attach metadata and an embedding to each chunk.

        NOTE(review): relies on a module-level get_embedding() and on
        metadata containing 'file_name' — both defined elsewhere; confirm
        against extract_metadata.
        """
        chunk_objects = []
        for i, chunk in enumerate(chunks):
            chunk_objects.append({
                # Stable, human-readable id: "<file>_<chunk index>".
                "id": f"{metadata['file_name']}_{i}",
                "content": chunk,
                "metadata": {
                    **metadata,
                    "chunk_index": i,
                    "char_count": len(chunk)
                },
                "embedding": get_embedding(chunk)
            })
        return chunk_objects
Handling Different File Types
def process_code_files(file_path, chunk_size=500, chunk_overlap=50):
    """Split a source-code file into syntax-aware chunks.

    Uses LangChain's language-specific splitter so chunk boundaries tend
    to fall on function/class boundaries rather than mid-statement.

    Args:
        file_path: path to the code file; language inferred from extension.
        chunk_size: max characters per chunk (default 500, as before).
        chunk_overlap: characters shared between consecutive chunks.

    Returns:
        A list of chunk strings, or None when the extension is not one of
        the mapped languages (the original silent-None contract is kept
        so existing callers that check for None still work).
    """
    from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

    # Extension -> LangChain language enum. Extend here to support more.
    language_map = {
        '.py': Language.PYTHON,
        '.js': Language.JS,
        '.java': Language.JAVA,
    }
    language = language_map.get(Path(file_path).suffix)
    if language is None:
        return None

    splitter = RecursiveCharacterTextSplitter.from_language(
        language=language,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # BUG FIX: read with explicit UTF-8 instead of the platform default
    # encoding, which fails on Windows (cp1252) for non-ASCII source files.
    code = Path(file_path).read_text(encoding='utf-8')
    return splitter.split_text(code)