
Assistants API: File Handling and Knowledge Retrieval

The Assistants API’s retrieval tool enables RAG (Retrieval-Augmented Generation) without building your own vector database. Let’s look at how to use file handling effectively to build knowledge-based assistants.

File Upload and Management

from openai import OpenAI
from typing import List, Dict, Optional
from pathlib import Path
import mimetypes

class AssistantFileHandler:
    SUPPORTED_FORMATS = {
        'retrieval': ['.pdf', '.docx', '.txt', '.md', '.html', '.json', '.csv'],
        'code_interpreter': ['.py', '.csv', '.xlsx', '.json', '.txt', '.pdf']
    }

    MAX_FILE_SIZE_MB = 512  # OpenAI limit

    def __init__(self, client: OpenAI):
        self.client = client

    def upload_file(
        self,
        file_path: str,
        purpose: str = "assistants"
    ) -> str:
        """Upload a file for use with assistants."""
        path = Path(file_path)

        # Validate file
        self._validate_file(path)

        with open(path, "rb") as f:
            file = self.client.files.create(file=f, purpose=purpose)

        return file.id

    def _validate_file(self, path: Path):
        """Validate file before upload."""
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        size_mb = path.stat().st_size / (1024 * 1024)
        if size_mb > self.MAX_FILE_SIZE_MB:
            raise ValueError(f"File too large: {size_mb:.1f}MB (max {self.MAX_FILE_SIZE_MB}MB)")

        suffix = path.suffix.lower()
        all_supported = set(self.SUPPORTED_FORMATS['retrieval'] + self.SUPPORTED_FORMATS['code_interpreter'])
        if suffix not in all_supported:
            raise ValueError(f"Unsupported file format: {suffix}")

    def upload_directory(
        self,
        directory_path: str,
        extensions: Optional[List[str]] = None
    ) -> List[str]:
        """Upload all matching files from a directory."""
        dir_path = Path(directory_path)
        extensions = extensions or self.SUPPORTED_FORMATS['retrieval']

        file_ids = []
        for ext in extensions:
            for file_path in dir_path.glob(f"**/*{ext}"):
                try:
                    file_id = self.upload_file(str(file_path))
                    file_ids.append(file_id)
                    print(f"Uploaded: {file_path.name} -> {file_id}")
                except Exception as e:
                    print(f"Failed to upload {file_path.name}: {e}")

        return file_ids

    def list_files(self, purpose: Optional[str] = None) -> List[Dict]:
        """List all uploaded files."""
        files = self.client.files.list()
        result = []

        for f in files.data:
            if purpose is None or f.purpose == purpose:
                result.append({
                    "id": f.id,
                    "filename": f.filename,
                    "purpose": f.purpose,
                    "bytes": f.bytes,
                    "created_at": f.created_at
                })

        return result

    def delete_file(self, file_id: str) -> bool:
        """Delete an uploaded file."""
        self.client.files.delete(file_id)
        return True

    def cleanup_old_files(self, max_age_days: int = 30):
        """Delete files older than specified days."""
        import time

        cutoff = time.time() - (max_age_days * 24 * 60 * 60)
        files = self.list_files()

        deleted = 0
        for f in files:
            if f["created_at"] < cutoff:
                self.delete_file(f["id"])
                deleted += 1

        return deleted
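
Here’s a minimal usage sketch for the handler above; the docs/ folder, the chosen extensions, and the 30-day retention window are illustrative assumptions, not API requirements.

# Hypothetical usage of AssistantFileHandler (assumes a local docs/ folder)
client = OpenAI()
handler = AssistantFileHandler(client)

# Upload every Markdown and PDF file under docs/
doc_ids = handler.upload_directory("docs", extensions=[".md", ".pdf"])

# Inspect what is now stored server-side
for info in handler.list_files(purpose="assistants"):
    print(info["filename"], info["bytes"], "bytes")

# Prune anything older than 30 days
removed = handler.cleanup_old_files(max_age_days=30)
print(f"Deleted {removed} stale file(s)")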

Creating Knowledge-Based Assistants

class KnowledgeAssistant:
    """Assistant with retrieval-augmented generation capabilities."""

    def __init__(self, client: OpenAI):
        self.client = client
        self.file_handler = AssistantFileHandler(client)

    def create_with_knowledge_base(
        self,
        name: str,
        instructions: str,
        file_paths: List[str],
        model: str = "gpt-4-1106-preview"
    ) -> str:
        """Create an assistant with a knowledge base."""

        # Upload all files
        file_ids = []
        for path in file_paths:
            file_id = self.file_handler.upload_file(path)
            file_ids.append(file_id)
            print(f"Uploaded {path} -> {file_id}")

        # Create assistant with retrieval
        assistant = self.client.beta.assistants.create(
            name=name,
            instructions=instructions,
            model=model,
            tools=[{"type": "retrieval"}],
            file_ids=file_ids
        )

        return assistant.id

    def add_files_to_assistant(
        self,
        assistant_id: str,
        file_paths: List[str]
    ):
        """Add more files to an existing assistant."""
        for path in file_paths:
            file_id = self.file_handler.upload_file(path)
            self.client.beta.assistants.files.create(
                assistant_id=assistant_id,
                file_id=file_id
            )

    def remove_file_from_assistant(
        self,
        assistant_id: str,
        file_id: str
    ):
        """Remove a file from an assistant."""
        self.client.beta.assistants.files.delete(
            assistant_id=assistant_id,
            file_id=file_id
        )

    def get_assistant_files(self, assistant_id: str) -> List[Dict]:
        """List all files attached to an assistant."""
        files = self.client.beta.assistants.files.list(assistant_id)
        return [{"id": f.id, "created_at": f.created_at} for f in files.data]

# Example: Create a documentation assistant
client = OpenAI()
kb_assistant = KnowledgeAssistant(client)

assistant_id = kb_assistant.create_with_knowledge_base(
    name="Product Documentation Expert",
    instructions="""You are a product documentation expert.
    Answer questions based ONLY on the provided documentation.
    If information is not in the docs, say so clearly.
    Always cite the source document when providing information.""",
    file_paths=[
        "docs/user_guide.pdf",
        "docs/api_reference.md",
        "docs/faq.txt"
    ]
)
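
To see the knowledge base in action, here’s a hedged sketch of querying the assistant: create a thread, start a run, poll until it finishes, and read the latest reply. The question text and the one-second polling interval are illustrative choices.

import time

# Ask the documentation assistant a question (illustrative sketch)
thread = client.beta.threads.create(messages=[
    {"role": "user", "content": "How do I rotate my API key?"}
])

run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant_id
)

# Simple polling loop; production code should add a timeout
while run.status in ("queued", "in_progress"):
    time.sleep(1)
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

# Messages are returned newest first
messages = client.beta.threads.messages.list(thread_id=thread.id)
print(messages.data[0].content[0].text.value)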

Thread-Level File Attachments

class ThreadFileManager:
    """Manage files at the thread level for context-specific data."""

    def __init__(self, client: OpenAI):
        self.client = client
        self.file_handler = AssistantFileHandler(client)

    def create_thread_with_files(
        self,
        initial_message: str,
        file_paths: List[str]
    ) -> str:
        """Create a thread with file attachments."""
        # Upload files
        file_ids = [
            self.file_handler.upload_file(path)
            for path in file_paths
        ]

        # Create thread with initial message and files
        thread = self.client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": initial_message,
                    "file_ids": file_ids
                }
            ]
        )

        return thread.id

    def add_file_to_message(
        self,
        thread_id: str,
        message: str,
        file_path: str
    ):
        """Add a message with a file attachment."""
        file_id = self.file_handler.upload_file(file_path)

        return self.client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=message,
            file_ids=[file_id]
        )

# Example: Analyze a specific document in conversation
thread_files = ThreadFileManager(client)

thread_id = thread_files.create_thread_with_files(
    "Please analyze this financial report and summarize the key metrics.",
    ["reports/q3_financials.pdf"]
)
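
A follow-up message on the same thread can carry its own attachment; the Q2 report path below is purely illustrative.

# Attach another document to the same conversation for comparison
thread_files.add_file_to_message(
    thread_id,
    "Compare this Q2 report with the Q3 report above.",
    "reports/q2_financials.pdf"
)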

Handling Generated Files

class GeneratedFileHandler:
    """Handle files generated by the assistant (charts, exports, etc.)."""

    def __init__(self, client: OpenAI):
        self.client = client

    def extract_files_from_response(
        self,
        thread_id: str,
        run_id: Optional[str] = None
    ) -> List[Dict]:
        """Extract generated files from assistant responses, optionally filtered to one run."""
        messages = self.client.beta.threads.messages.list(thread_id)
        generated_files = []

        for message in messages.data:
            if message.role != "assistant":
                continue
            # Only consider messages produced by the requested run, if one was given
            if run_id and message.run_id != run_id:
                continue

            for content in message.content:
                # Check for file annotations
                if hasattr(content, 'text') and content.text.annotations:
                    for annotation in content.text.annotations:
                        if annotation.type == "file_path":
                            generated_files.append({
                                "file_id": annotation.file_path.file_id,
                                "text_reference": annotation.text,
                                "start_index": annotation.start_index,
                                "end_index": annotation.end_index
                            })

                # Check for image files
                if content.type == "image_file":
                    generated_files.append({
                        "file_id": content.image_file.file_id,
                        "type": "image"
                    })

        return generated_files

    def download_file(self, file_id: str, output_path: str):
        """Download a generated file."""
        content = self.client.files.content(file_id)
        with open(output_path, "wb") as f:
            f.write(content.read())
        return output_path

    def process_code_interpreter_output(
        self,
        thread_id: str,
        output_dir: str = "output"
    ) -> List[str]:
        """Download all files generated by code interpreter."""
        import os
        os.makedirs(output_dir, exist_ok=True)

        generated = self.extract_files_from_response(thread_id)
        downloaded = []

        for i, file_info in enumerate(generated):
            file_id = file_info["file_id"]
            ext = ".png" if file_info.get("type") == "image" else ".csv"
            output_path = os.path.join(output_dir, f"output_{i}{ext}")
            self.download_file(file_id, output_path)
            downloaded.append(output_path)

        return downloaded
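
A brief usage sketch, assuming a run that used the Code Interpreter has already completed on thread_id:

# Hypothetical usage: collect charts/exports produced by a completed run
gen_handler = GeneratedFileHandler(client)

files = gen_handler.extract_files_from_response(thread_id)
print(f"Assistant produced {len(files)} file(s)")

paths = gen_handler.process_code_interpreter_output(thread_id, output_dir="output")
for path in paths:
    print("Saved", path)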

Best Practices for File Handling

file_handling_best_practices = {
    "organization": [
        "Use descriptive filenames",
        "Group related documents together",
        "Version your document uploads"
    ],
    "optimization": [
        "Pre-process large PDFs into smaller chunks",
        "Remove unnecessary formatting from documents",
        "Use plain text where possible for better retrieval"
    ],
    "security": [
        "Don't upload files with sensitive data",
        "Implement access controls for file uploads",
        "Regularly audit and clean up files"
    ],
    "retrieval_quality": [
        "Include descriptive headers in documents",
        "Use clear section titles",
        "Provide context in document introductions"
    ]
}
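
On the pre-processing point above, here is a minimal chunking sketch. The chunk size, output directory, and naming scheme are arbitrary choices, and for PDFs you would extract plain text first with a library of your choice before splitting.

def chunk_text_file(path: str, chunk_chars: int = 8000, out_dir: str = "chunks") -> List[str]:
    """Split a large plain-text document into smaller files before upload."""
    import os
    os.makedirs(out_dir, exist_ok=True)

    text = Path(path).read_text(encoding="utf-8")
    chunk_paths = []
    for i in range(0, len(text), chunk_chars):
        # Name each chunk after the source file plus a zero-padded part index
        chunk_path = os.path.join(out_dir, f"{Path(path).stem}_part{i // chunk_chars:03d}.txt")
        with open(chunk_path, "w", encoding="utf-8") as f:
            f.write(text[i:i + chunk_chars])
        chunk_paths.append(chunk_path)
    return chunk_paths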

Tomorrow, we’ll explore the Code Interpreter tool and how to leverage it for data analysis!

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.