
Assistants API: File Handling and Knowledge Retrieval

The Assistants API’s retrieval tool enables RAG (Retrieval-Augmented Generation) without building your own vector database. Let’s look at how to use file handling effectively to build knowledge-based assistants.

File Upload and Management

from openai import OpenAI
from typing import List, Dict, Optional
from pathlib import Path
import mimetypes

class AssistantFileHandler:
    SUPPORTED_FORMATS = {
        'retrieval': ['.pdf', '.docx', '.txt', '.md', '.html', '.json', '.csv'],
        'code_interpreter': ['.py', '.csv', '.xlsx', '.json', '.txt', '.pdf']
    }

    MAX_FILE_SIZE_MB = 512  # OpenAI limit

    def __init__(self, client: OpenAI):
        self.client = client

    def upload_file(
        self,
        file_path: str,
        purpose: str = "assistants"
    ) -> str:
        """Upload a file for use with assistants."""
        path = Path(file_path)

        # Validate file
        self._validate_file(path)

        with open(path, "rb") as f:
            file = self.client.files.create(file=f, purpose=purpose)

        return file.id

    def _validate_file(self, path: Path):
        """Validate file before upload."""
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        size_mb = path.stat().st_size / (1024 * 1024)
        if size_mb > self.MAX_FILE_SIZE_MB:
            raise ValueError(f"File too large: {size_mb:.1f}MB (max {self.MAX_FILE_SIZE_MB}MB)")

        suffix = path.suffix.lower()
        all_supported = set(self.SUPPORTED_FORMATS['retrieval'] + self.SUPPORTED_FORMATS['code_interpreter'])
        if suffix not in all_supported:
            raise ValueError(f"Unsupported file format: {suffix}")

    def upload_directory(
        self,
        directory_path: str,
        extensions: Optional[List[str]] = None
    ) -> List[str]:
        """Upload all matching files from a directory."""
        dir_path = Path(directory_path)
        extensions = extensions or self.SUPPORTED_FORMATS['retrieval']

        file_ids = []
        for ext in extensions:
            for file_path in dir_path.glob(f"**/*{ext}"):
                try:
                    file_id = self.upload_file(str(file_path))
                    file_ids.append(file_id)
                    print(f"Uploaded: {file_path.name} -> {file_id}")
                except Exception as e:
                    print(f"Failed to upload {file_path.name}: {e}")

        return file_ids

    def list_files(self, purpose: Optional[str] = None) -> List[Dict]:
        """List all uploaded files."""
        files = self.client.files.list()
        result = []

        for f in files.data:
            if purpose is None or f.purpose == purpose:
                result.append({
                    "id": f.id,
                    "filename": f.filename,
                    "purpose": f.purpose,
                    "bytes": f.bytes,
                    "created_at": f.created_at
                })

        return result

    def delete_file(self, file_id: str) -> bool:
        """Delete an uploaded file."""
        self.client.files.delete(file_id)
        return True

    def cleanup_old_files(self, max_age_days: int = 30):
        """Delete files older than specified days."""
        import time

        cutoff = time.time() - (max_age_days * 24 * 60 * 60)
        files = self.list_files()

        deleted = 0
        for f in files:
            if f["created_at"] < cutoff:
                self.delete_file(f["id"])
                deleted += 1

        return deleted
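
Here’s a minimal usage sketch for the handler above; the docs/ folder, the chosen extensions, and the 30-day retention window are illustrative assumptions, not API requirements.

# Hypothetical usage of AssistantFileHandler (assumes a local docs/ folder)
client = OpenAI()
handler = AssistantFileHandler(client)

# Upload every Markdown and PDF file under docs/
doc_ids = handler.upload_directory("docs", extensions=[".md", ".pdf"])

# Inspect what is now stored server-side
for info in handler.list_files(purpose="assistants"):
    print(info["filename"], info["bytes"], "bytes")

# Prune anything older than 30 days
removed = handler.cleanup_old_files(max_age_days=30)
print(f"Deleted {removed} stale file(s)")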

Creating Knowledge-Based Assistants

class KnowledgeAssistant:
    """Assistant with retrieval-augmented generation capabilities."""

    def __init__(self, client: OpenAI):
        self.client = client
        self.file_handler = AssistantFileHandler(client)

    def create_with_knowledge_base(
        self,
        name: str,
        instructions: str,
        file_paths: List[str],
        model: str = "gpt-4-1106-preview"
    ) -> str:
        """Create an assistant with a knowledge base."""

        # Upload all files
        file_ids = []
        for path in file_paths:
            file_id = self.file_handler.upload_file(path)
            file_ids.append(file_id)
            print(f"Uploaded {path} -> {file_id}")

        # Create assistant with retrieval
        assistant = self.client.beta.assistants.create(
            name=name,
            instructions=instructions,
            model=model,
            tools=[{"type": "retrieval"}],
            file_ids=file_ids
        )

        return assistant.id

    def add_files_to_assistant(
        self,
        assistant_id: str,
        file_paths: List[str]
    ):
        """Add more files to an existing assistant."""
        for path in file_paths:
            file_id = self.file_handler.upload_file(path)
            self.client.beta.assistants.files.create(
                assistant_id=assistant_id,
                file_id=file_id
            )

    def remove_file_from_assistant(
        self,
        assistant_id: str,
        file_id: str
    ):
        """Remove a file from an assistant."""
        self.client.beta.assistants.files.delete(
            assistant_id=assistant_id,
            file_id=file_id
        )

    def get_assistant_files(self, assistant_id: str) -> List[Dict]:
        """List all files attached to an assistant."""
        files = self.client.beta.assistants.files.list(assistant_id)
        return [{"id": f.id, "created_at": f.created_at} for f in files.data]

# Example: Create a documentation assistant
client = OpenAI()
kb_assistant = KnowledgeAssistant(client)

assistant_id = kb_assistant.create_with_knowledge_base(
    name="Product Documentation Expert",
    instructions="""You are a product documentation expert.
    Answer questions based ONLY on the provided documentation.
    If information is not in the docs, say so clearly.
    Always cite the source document when providing information.""",
    file_paths=[
        "docs/user_guide.pdf",
        "docs/api_reference.md",
        "docs/faq.txt"
    ]
)
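
To see the knowledge base in action, here’s a hedged sketch of querying the assistant: create a thread, start a run, poll until it finishes, and read the latest reply. The question text and the one-second polling interval are illustrative choices.

import time

# Ask the documentation assistant a question (illustrative sketch)
thread = client.beta.threads.create(messages=[
    {"role": "user", "content": "How do I rotate my API key?"}
])

run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant_id
)

# Simple polling loop; production code should add a timeout
while run.status in ("queued", "in_progress"):
    time.sleep(1)
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

# Messages are returned newest first
messages = client.beta.threads.messages.list(thread_id=thread.id)
print(messages.data[0].content[0].text.value)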

Thread-Level File Attachments

class ThreadFileManager:
    """Manage files at the thread level for context-specific data."""

    def __init__(self, client: OpenAI):
        self.client = client
        self.file_handler = AssistantFileHandler(client)

    def create_thread_with_files(
        self,
        initial_message: str,
        file_paths: List[str]
    ) -> str:
        """Create a thread with file attachments."""
        # Upload files
        file_ids = [
            self.file_handler.upload_file(path)
            for path in file_paths
        ]

        # Create thread with initial message and files
        thread = self.client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": initial_message,
                    "file_ids": file_ids
                }
            ]
        )

        return thread.id

    def add_file_to_message(
        self,
        thread_id: str,
        message: str,
        file_path: str
    ):
        """Add a message with a file attachment."""
        file_id = self.file_handler.upload_file(file_path)

        return self.client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=message,
            file_ids=[file_id]
        )

# Example: Analyze a specific document in conversation
thread_files = ThreadFileManager(client)

thread_id = thread_files.create_thread_with_files(
    "Please analyze this financial report and summarize the key metrics.",
    ["reports/q3_financials.pdf"]
)
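
A follow-up message on the same thread can carry its own attachment; the Q2 report path below is purely illustrative.

# Attach another document to the same conversation for comparison
thread_files.add_file_to_message(
    thread_id,
    "Compare this Q2 report with the Q3 report above.",
    "reports/q2_financials.pdf"
)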

Handling Generated Files

class GeneratedFileHandler:
    """Handle files generated by the assistant (charts, exports, etc.)."""

    def __init__(self, client: OpenAI):
        self.client = client

    def extract_files_from_response(
        self,
        thread_id: str,
        run_id: Optional[str] = None
    ) -> List[Dict]:
        """Extract generated files from assistant responses, optionally filtered to one run."""
        messages = self.client.beta.threads.messages.list(thread_id)
        generated_files = []

        for message in messages.data:
            if message.role != "assistant":
                continue
            # Only consider messages produced by the requested run, if one was given
            if run_id and message.run_id != run_id:
                continue

            for content in message.content:
                # Check for file annotations
                if hasattr(content, 'text') and content.text.annotations:
                    for annotation in content.text.annotations:
                        if annotation.type == "file_path":
                            generated_files.append({
                                "file_id": annotation.file_path.file_id,
                                "text_reference": annotation.text,
                                "start_index": annotation.start_index,
                                "end_index": annotation.end_index
                            })

                # Check for image files
                if content.type == "image_file":
                    generated_files.append({
                        "file_id": content.image_file.file_id,
                        "type": "image"
                    })

        return generated_files

    def download_file(self, file_id: str, output_path: str):
        """Download a generated file."""
        content = self.client.files.content(file_id)
        with open(output_path, "wb") as f:
            f.write(content.read())
        return output_path

    def process_code_interpreter_output(
        self,
        thread_id: str,
        output_dir: str = "output"
    ) -> List[str]:
        """Download all files generated by code interpreter."""
        import os
        os.makedirs(output_dir, exist_ok=True)

        generated = self.extract_files_from_response(thread_id)
        downloaded = []

        for i, file_info in enumerate(generated):
            file_id = file_info["file_id"]
            ext = ".png" if file_info.get("type") == "image" else ".csv"
            output_path = os.path.join(output_dir, f"output_{i}{ext}")
            self.download_file(file_id, output_path)
            downloaded.append(output_path)

        return downloaded
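
A brief usage sketch, assuming a run that used the Code Interpreter has already completed on thread_id:

# Hypothetical usage: collect charts/exports produced by a completed run
gen_handler = GeneratedFileHandler(client)

files = gen_handler.extract_files_from_response(thread_id)
print(f"Assistant produced {len(files)} file(s)")

paths = gen_handler.process_code_interpreter_output(thread_id, output_dir="output")
for path in paths:
    print("Saved", path)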

Best Practices for File Handling

file_handling_best_practices = {
    "organization": [
        "Use descriptive filenames",
        "Group related documents together",
        "Version your document uploads"
    ],
    "optimization": [
        "Pre-process large PDFs into smaller chunks",
        "Remove unnecessary formatting from documents",
        "Use plain text where possible for better retrieval"
    ],
    "security": [
        "Don't upload files with sensitive data",
        "Implement access controls for file uploads",
        "Regularly audit and clean up files"
    ],
    "retrieval_quality": [
        "Include descriptive headers in documents",
        "Use clear section titles",
        "Provide context in document introductions"
    ]
}
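
On the pre-processing point above, here is a minimal chunking sketch. The chunk size, output directory, and naming scheme are arbitrary choices, and for PDFs you would extract plain text first with a library of your choice before splitting.

def chunk_text_file(path: str, chunk_chars: int = 8000, out_dir: str = "chunks") -> List[str]:
    """Split a large plain-text document into smaller files before upload."""
    import os
    os.makedirs(out_dir, exist_ok=True)

    text = Path(path).read_text(encoding="utf-8")
    chunk_paths = []
    for i in range(0, len(text), chunk_chars):
        # Name each chunk after the source file plus a zero-padded part index
        chunk_path = os.path.join(out_dir, f"{Path(path).stem}_part{i // chunk_chars:03d}.txt")
        with open(chunk_path, "w", encoding="utf-8") as f:
            f.write(text[i:i + chunk_chars])
        chunk_paths.append(chunk_path)
    return chunk_paths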

Tomorrow, we’ll explore the Code Interpreter tool and how to leverage it for data analysis!

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.