Back to Blog
4 min read

Code Completion for Data Science Notebooks

Code completion in notebooks goes beyond autocomplete: context-aware suggestions take your data, imports, and analysis goals into account. Here’s how to build intelligent completion systems.

Context-Aware Completion

from dataclasses import dataclass
from typing import Optional

@dataclass
class NotebookContext:
    """Snapshot of notebook state that the completers turn into prompts."""

    # Import statement lines collected from the notebook's cells.
    imports: list[str]
    # Variables defined in the notebook.
    # NOTE(review): build_context currently always passes an empty dict here.
    defined_variables: dict
    dataframes: dict  # name -> column info
    # The most recent notebook cells (build_context keeps the last five).
    # NOTE(review): build_context stores the raw cell objects from
    # notebook_state["cells"] here, not plain strings -- confirm which is intended.
    recent_cells: list[str]
    # Text of the line being edited; interpolated into the completion prompt.
    current_line: str
    # Cursor position as reported by the notebook state (defaults to 0 when
    # missing). Presumably an offset within current_line -- TODO confirm.
    cursor_position: int

class NotebookCompleter:
    """Context-aware code completion for notebooks.

    The completer scans notebook cells for imports and pandas DataFrame
    assignments, then asks a chat model for completions based on that context.

    Args:
        client: Object exposing an awaitable
            ``chat_completion(model=..., messages=..., temperature=...)``
            whose result has a ``content`` attribute.
        model: Model name passed to every ``chat_completion`` call.
    """

    def __init__(self, client, model: str = "gpt-35-turbo"):
        self.client = client
        self.model = model

    @staticmethod
    def _cell_source(cell) -> str:
        """Return a cell's source as a single string."""
        source = cell.get("source", "")
        # Notebook files may store the source as a list of lines (each line
        # keeping its own newline), so join rather than assume a string.
        if isinstance(source, (list, tuple)):
            return "".join(source)
        return source or ""

    @staticmethod
    def _find_imports(code: str) -> list[str]:
        """Return the stripped ``import``/``from`` lines found in ``code``."""
        stripped = (line.strip() for line in code.splitlines())
        return [line for line in stripped if line.startswith(("import ", "from "))]

    @staticmethod
    def _find_dataframe_names(code: str) -> list[str]:
        """Return names assigned from ``pd.read*`` or ``pd.DataFrame``."""
        import re

        # One match per assignment line: `name = pd.read_csv(...)`,
        # `name = pd.DataFrame(...)`, optionally with a type annotation.
        pattern = re.compile(
            r"^[ \t]*([A-Za-z_]\w*)[ \t]*(?::[^=\n]+)?=[ \t]*"
            r"pd\.(?:read\w*|DataFrame)\b",
            re.MULTILINE,
        )
        return pattern.findall(code)

    @staticmethod
    def _parse_completions(raw) -> list[str]:
        """Parse a model reply into a list of completion strings.

        Returns an empty list when the reply is missing, is not valid JSON,
        or is not a JSON array. A Markdown code fence around the JSON is
        tolerated. Non-string array items are dropped.
        """
        import json

        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8", "replace")
        if not isinstance(raw, str):
            return []

        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line (with any language tag) and the
            # closing fence, keeping only what is in between.
            _, _, text = text.partition("\n")
            body, fence, _ = text.rpartition("```")
            if fence:
                text = body

        try:
            parsed = json.loads(text)
        except (ValueError, RecursionError):
            # Best effort: an unparseable reply simply yields no completions.
            return []

        if not isinstance(parsed, list):
            return []
        return [item for item in parsed if isinstance(item, str)]

    def build_context(self, notebook_state: dict) -> "NotebookContext":
        """Build context from notebook state.

        Args:
            notebook_state: Mapping with optional keys ``"cells"`` (a list of
                cell mappings carrying ``"source"`` and optionally
                ``"cell_type"``), ``"current_line"`` and ``"cursor_position"``.

        Returns:
            A ``NotebookContext`` holding the de-duplicated import lines, the
            detected DataFrame names, the source text of the last five cells,
            and the current line and cursor position.
        """
        cells = notebook_state.get("cells") or []

        # A dict is used as an ordered set so that re-running the same import
        # in several cells does not eat into the prompt's import budget.
        imports: dict[str, None] = {}
        dataframes = {}

        for cell in cells:
            # Cells without a declared type are treated as code; prose cells
            # are skipped so sentences starting with "from " are not imports.
            if cell.get("cell_type", "code") != "code":
                continue

            code = self._cell_source(cell)
            imports.update(dict.fromkeys(self._find_imports(code)))

            for name in self._find_dataframe_names(code):
                # Static analysis cannot see the real schema.
                dataframes[name] = {"columns": [], "shape": "unknown"}

        return NotebookContext(
            imports=list(imports),
            # Variable tracking is not implemented; the field stays empty.
            defined_variables={},
            dataframes=dataframes,
            # Store source text so the value matches the declared list[str].
            recent_cells=[self._cell_source(cell) for cell in cells[-5:]],
            current_line=notebook_state.get("current_line", ""),
            cursor_position=notebook_state.get("cursor_position", 0),
        )

    async def complete(
        self,
        context: "NotebookContext",
        trigger: Optional[str] = None
    ) -> list[str]:
        """Generate completions based on context.

        Args:
            context: Notebook context; ``imports``, ``dataframes`` and
                ``current_line`` are used to build the prompt.
            trigger: Accepted for interface compatibility; currently unused.

        Returns:
            The completions suggested by the model, or an empty list when the
            reply cannot be parsed as a JSON array.
        """
        imports_str = "\n".join(context.imports[:10])
        dfs_str = ", ".join(context.dataframes.keys()) if context.dataframes else "None"

        prompt = f"""Generate code completions for a Jupyter notebook.

Imports:
{imports_str}

DataFrames available: {dfs_str}
Current line: {context.current_line}

Provide 3 likely completions. Consider:
- What the user is likely trying to do
- Available variables and DataFrames
- Common patterns

Return as JSON array: ["completion1", "completion2", "completion3"]"""

        response = await self.client.chat_completion(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        return self._parse_completions(getattr(response, "content", None))

    async def complete_function_call(
        self,
        function_name: str,
        context: "NotebookContext"
    ) -> str:
        """Complete function call with appropriate arguments.

        Args:
            function_name: Name of the function whose call should be completed.
            context: Notebook context; the names of its DataFrames are
                mentioned in the prompt when any are present.

        Returns:
            The model's reply with surrounding whitespace removed.
        """
        df_info = ""
        if context.dataframes:
            df_info = f"Available DataFrames: {list(context.dataframes.keys())}"

        prompt = f"""Complete this function call.

Function: {function_name}
{df_info}

Provide the complete function call with common arguments filled in.
Return only the code."""

        response = await self.client.chat_completion(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2
        )

        return response.content.strip()

DataFrame-Aware Completion

class DataFrameCompletion:
    """Completions that understand DataFrame structure.

    DataFrames are registered by name; their columns, dtypes and shape are
    kept in ``df_registry`` and used to build suggestions.

    Args:
        client: Object exposing an awaitable
            ``chat_completion(model=..., messages=..., temperature=...)``
            whose result has a ``content`` attribute.
    """

    def __init__(self, client):
        self.client = client
        self.df_registry = {}

    def register_dataframe(self, name: str, df):
        """Register a DataFrame for completion context.

        Args:
            name: Variable name the DataFrame is known by in the notebook.
            df: Object providing ``columns``, ``dtypes.to_dict()`` and
                ``shape``; only this metadata is stored, not the data.
        """
        self.df_registry[name] = {
            "columns": list(df.columns),
            "dtypes": df.dtypes.to_dict(),
            "shape": df.shape
        }

    @staticmethod
    def _is_numeric(dtype) -> bool:
        """Return True for integer/float dtype names, including nullable ones."""
        import re

        # Lower-casing covers the capitalised nullable dtypes ("Int64"), and a
        # full match keeps "interval[...]" from being mistaken for an integer.
        return re.fullmatch(
            r"(?:u?int|float|double)\d*(?:\[pyarrow\])?",
            str(dtype).lower(),
        ) is not None

    @staticmethod
    def _is_categorical(dtype) -> bool:
        """Return True for object, category and string dtype names."""
        dtype_name = str(dtype).lower()
        return any(tag in dtype_name for tag in ("object", "category", "string"))

    async def complete_column_access(
        self,
        df_name: str,
        partial: str = ""
    ) -> list[str]:
        """Complete DataFrame column access.

        Args:
            df_name: Name of a registered DataFrame.
            partial: Case-insensitive prefix the column label must start with;
                an empty string matches every column.

        Returns:
            Up to ten ``df_name[<label>]`` expressions, or an empty list when
            ``df_name`` is not registered.
        """
        if df_name not in self.df_registry:
            return []

        columns = self.df_registry[df_name]["columns"]

        if partial:
            prefix = partial.lower()
            # Labels are not always strings (integer labels are common), so
            # compare on their text form.
            matches = [c for c in columns if str(c).lower().startswith(prefix)]
        else:
            matches = columns

        # repr() yields a valid literal for integer labels and for labels
        # that contain quote characters.
        return [f"{df_name}[{col!r}]" for col in matches[:10]]

    async def suggest_transformation(
        self,
        df_name: str,
        description: str
    ) -> str:
        """Suggest DataFrame transformation code.

        Args:
            df_name: Name of a registered DataFrame.
            description: Natural-language description of the transformation.

        Returns:
            The model's reply as-is, or an empty string when ``df_name`` is
            not registered.
        """
        if df_name not in self.df_registry:
            return ""

        df_info = self.df_registry[df_name]

        prompt = f"""Generate pandas code for this transformation.

DataFrame '{df_name}':
Columns: {df_info['columns']}
Types: {df_info['dtypes']}

Request: {description}

Return only the code."""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )

        return response.content

    async def complete_groupby(
        self,
        df_name: str,
        partial_code: str
    ) -> list[str]:
        """Complete groupby operations.

        Pairs each of the first three categorical columns with each of the
        first three numeric columns and suggests a grouped mean.

        Args:
            df_name: Name of a registered DataFrame.
            partial_code: Accepted for interface compatibility; currently
                unused.

        Returns:
            ``groupby(...)[...].mean()`` expressions, or an empty list when
            ``df_name`` is not registered.
        """
        if df_name not in self.df_registry:
            return []

        dtypes = self.df_registry[df_name]["dtypes"]

        categorical = [c for c, t in dtypes.items() if self._is_categorical(t)]
        numeric = [c for c, t in dtypes.items() if self._is_numeric(t)]

        return [
            f"{df_name}.groupby({cat!r})[{num!r}].mean()"
            for cat in categorical[:3]
            for num in numeric[:3]
        ]

Intelligent Docstring Completion

class DocstringCompleter:
    """Generate docstrings for notebook functions.

    Args:
        client: Object exposing an awaitable
            ``chat_completion(model=..., messages=..., temperature=...)``
            whose result has a ``content`` attribute. It may be omitted at
            construction and assigned to ``self.client`` later, but it must
            be set before any completion method is awaited.
    """

    def __init__(self, client=None):
        self.client = client

    async def generate_docstring(
        self,
        function_code: str,
        style: str = "google"
    ) -> str:
        """Generate docstring for function.

        Args:
            function_code: Source code of the function to document.
            style: Docstring style named in the prompt (e.g. ``"google"``).

        Returns:
            The model's reply as-is.
        """
        # The function source is wrapped in a complete (opened and closed)
        # code fence so the model can tell the code from the instructions.
        prompt = f"""Generate a {style}-style docstring for this function.

```python
{function_code}
```

Return only the docstring (including quotes)."""

        response = await self.client.chat_completion(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )

        return response.content

    async def complete_inline_comment(
        self,
        code_line: str,
        surrounding_context: str
    ) -> str:
        """Generate inline comment for code.

        Args:
            code_line: The line of code to comment.
            surrounding_context: Nearby code given to the model as context.

        Returns:
            The model's reply with surrounding whitespace removed.
        """
        prompt = f"""Generate a brief inline comment for this code.

Context: {surrounding_context}

Line to comment: {code_line}

Return only the comment (with #)."""

        response = await self.client.chat_completion(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2
        )

        return response.content.strip()

Usage Pattern

# In a notebook
# NOTE(review): this snippet relies on names it does not define -- `client`
# (the chat client passed to both completers), `pd` (presumably pandas) and
# `context` (presumably the result of completer.build_context(...)); define
# them before running. The bare `await` calls assume an environment that
# allows top-level await, such as a notebook cell.
completer = NotebookCompleter(client)
df_completer = DataFrameCompletion(client)

# Register DataFrames when created
# Registration stores the frame's columns, dtypes and shape under the given name.
df = pd.read_csv("data.csv")
df_completer.register_dataframe("df", df)

# Get completions
completions = await completer.complete(context)

# Get DataFrame-specific completions
# The second argument is a case-insensitive prefix of the column name.
column_completions = await df_completer.complete_column_access("df", "cust")

# Get transformation suggestions
# Returns an empty string if "df" has not been registered.
transform_code = await df_completer.suggest_transformation(
    "df",
    "Calculate monthly averages by customer"
)

Intelligent code completion transforms notebooks from blank-page intimidation to guided data exploration. The AI understands both the language and your data context.

Michael John Pena

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.