
Feature Engineering with LLMs: AI-Powered Feature Creation

Large Language Models can revolutionize feature engineering. Generate features from unstructured data, create semantic embeddings, and discover feature interactions that traditional methods miss.

LLM-Powered Feature Extraction

import json
from dataclasses import dataclass
from typing import Optional

@dataclass
class ExtractedFeatures:
    numeric_features: dict
    categorical_features: dict
    text_features: dict
    metadata: dict

class LLMFeatureExtractor:
    """Extract features from unstructured data using LLMs."""

    def __init__(self, client, model: str = "gpt-4"):
        self.client = client
        self.model = model

    async def extract_features_from_text(
        self,
        text: str,
        schema: dict,
        context: Optional[str] = None
    ) -> ExtractedFeatures:
        """Extract structured features from unstructured text."""

        schema_str = json.dumps(schema, indent=2)

        prompt = f"""Extract structured features from this text.

Text:
{text}

{f'Context: {context}' if context else ''}

Expected Schema:
{schema_str}

Rules:
- Extract only features present in the schema
- Return null for missing values
- Normalize numeric values where appropriate
- Categorize text fields according to schema enums

Return as JSON matching the schema exactly."""

        response = await self.client.chat_completion(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        try:
            features = json.loads(response.content)
            return self._parse_features(features, schema)
        except json.JSONDecodeError:
            # Fall back to empty features but keep the raw output for debugging
            return ExtractedFeatures({}, {}, {}, {"raw": response.content})

    def _parse_features(self, features: dict, schema: dict) -> ExtractedFeatures:
        """Parse and categorize extracted features."""
        numeric = {}
        categorical = {}
        text = {}

        for key, value in features.items():
            if key in schema:
                field_type = schema[key].get("type", "string")
                if field_type in ["number", "integer", "float"]:
                    numeric[key] = value
                elif field_type == "string" and "enum" in schema[key]:
                    categorical[key] = value
                else:
                    text[key] = value

        return ExtractedFeatures(
            numeric_features=numeric,
            categorical_features=categorical,
            text_features=text,
            metadata={"schema": schema}
        )

    async def extract_product_features(self, description: str) -> dict:
        """Extract product features from description."""

        prompt = f"""Extract product features from this description.

Description:
{description}

Extract:
- price_range: low/medium/high/premium
- category: electronics/clothing/home/food/other
- target_audience: general/youth/professional/senior
- quality_indicators: list of quality-related mentions
- key_benefits: list of main benefits
- sentiment: positive/neutral/negative
- urgency_level: none/low/medium/high

Return as JSON."""

        response = await self.client.chat_completion(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        return json.loads(response.content)
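
A minimal usage sketch, assuming client is an async wrapper exposing the chat_completion method used above (the schema fields here are illustrative):

import asyncio

support_schema = {
    "issue_type": {"type": "string", "enum": ["billing", "technical", "account"]},
    "severity": {"type": "integer"},
    "summary": {"type": "string"}
}

async def demo_extraction(client):
    extractor = LLMFeatureExtractor(client)
    features = await extractor.extract_features_from_text(
        text="My invoice was charged twice this month and support hasn't replied.",
        schema=support_schema,
        context="Customer support tickets for a SaaS product"
    )
    # Enum-typed fields land in categorical_features, numbers in numeric_features
    print(features.categorical_features)  # e.g. {"issue_type": "billing"}
    print(features.numeric_features)      # e.g. {"severity": 3}

# asyncio.run(demo_extraction(client)) once the client is constructed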

Semantic Feature Generation

import numpy as np
from typing import List

class SemanticFeatureGenerator:
    """Generate semantic features using embeddings and LLMs."""

    def __init__(self, embedding_client, llm_client):
        self.embedding_client = embedding_client
        self.llm_client = llm_client

    async def generate_embeddings(
        self,
        texts: List[str],
        model: str = "text-embedding-ada-002"
    ) -> np.ndarray:
        """Generate embeddings for texts."""
        response = await self.embedding_client.create_embeddings(
            input=texts,
            model=model
        )
        return np.array([e.embedding for e in response.data])

    async def compute_similarity_features(
        self,
        texts: List[str],
        reference_texts: List[str]
    ) -> np.ndarray:
        """Compute similarity features against reference texts."""

        # Get embeddings
        text_embeddings = await self.generate_embeddings(texts)
        ref_embeddings = await self.generate_embeddings(reference_texts)

        # Compute cosine similarities
        similarities = np.dot(text_embeddings, ref_embeddings.T)
        norms = np.outer(
            np.linalg.norm(text_embeddings, axis=1),
            np.linalg.norm(ref_embeddings, axis=1)
        )
        return similarities / norms

    async def generate_semantic_categories(
        self,
        texts: List[str],
        categories: List[str],
        threshold: float = 0.7
    ) -> List[List[str]]:
        """Categorize texts using semantic similarity."""

        similarities = await self.compute_similarity_features(texts, categories)

        results = []
        for i, text in enumerate(texts):
            matched_categories = [
                categories[j]
                for j, sim in enumerate(similarities[i])
                if sim > threshold
            ]
            results.append(matched_categories)

        return results

    async def extract_topics(
        self,
        texts: List[str],
        num_topics: int = 5
    ) -> dict:
        """Extract topics from text collection."""

        sample_texts = texts[:20]  # cap the sample to keep the prompt compact
        combined = "\n---\n".join(sample_texts)

        prompt = f"""Identify the main topics from these texts.

Texts:
{combined}

Extract {num_topics} main topics. For each topic provide:
- topic_name: short descriptive name
- keywords: list of related keywords
- prevalence: estimated percentage of texts containing this topic

Return as JSON array."""

        response = await self.llm_client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        return json.loads(response.content)

    async def compute_topic_features(
        self,
        texts: List[str],
        topics: List[dict]
    ) -> np.ndarray:
        """Compute topic membership features."""

        # Create topic descriptions for similarity
        topic_texts = [
            f"{t['topic_name']}: {', '.join(t['keywords'])}"
            for t in topics
        ]

        return await self.compute_similarity_features(texts, topic_texts)
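
Here is a short sketch of how these pieces compose, assuming embedding_client and llm_client implement the interfaces used above (the review texts and categories are illustrative):

async def demo_semantic_features(embedding_client, llm_client):
    gen = SemanticFeatureGenerator(embedding_client, llm_client)

    reviews = [
        "Battery lasts all day and the screen is gorgeous",
        "Shipping took three weeks and the box arrived damaged"
    ]
    categories = ["product quality", "shipping and delivery", "customer service"]

    # Multi-label categorization via embedding similarity
    labels = await gen.generate_semantic_categories(reviews, categories, threshold=0.75)
    # e.g. [["product quality"], ["shipping and delivery"]]

    # Topic features: one similarity column per discovered topic
    topics = await gen.extract_topics(reviews, num_topics=2)
    topic_features = await gen.compute_topic_features(reviews, topics)
    print(topic_features.shape)  # (len(reviews), num_topics)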

Feature Interaction Discovery

class FeatureInteractionDiscovery:
    """Discover feature interactions using LLMs."""

    def __init__(self, client):
        self.client = client

    async def suggest_interactions(
        self,
        feature_names: List[str],
        feature_descriptions: dict,
        target_description: str
    ) -> List[dict]:
        """Suggest meaningful feature interactions."""

        features_str = "\n".join([
            f"- {name}: {feature_descriptions.get(name, 'No description')}"
            for name in feature_names
        ])

        prompt = f"""Suggest meaningful feature interactions for ML modeling.

Available Features:
{features_str}

Target Variable: {target_description}

For each suggested interaction:
1. Features involved
2. Type of interaction (multiply, ratio, difference, polynomial)
3. Business rationale
4. Expected predictive value

Return as JSON array of interactions."""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5
        )

        return json.loads(response.content)

    async def generate_derived_features(
        self,
        df_schema: dict,
        target: str,
        domain: str
    ) -> List[dict]:
        """Generate derived feature suggestions."""

        prompt = f"""Suggest derived features for a {domain} ML problem.

Available columns:
{json.dumps(df_schema, indent=2)}

Target variable: {target}

Suggest derived features including:
- Mathematical transformations (log, sqrt, power)
- Date/time features (day of week, is_weekend, etc.)
- Aggregations (rolling means, counts)
- Business-specific features

For each feature provide:
- name: feature name
- formula: how to compute it
- rationale: why it might be predictive

Return as JSON array."""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )

        return json.loads(response.content)

    def apply_suggested_interactions(
        self,
        df,
        interactions: List[dict]
    ):
        """Apply suggested interactions to DataFrame."""
        import pandas as pd

        for interaction in interactions:
            features = interaction["features"]
            interaction_type = interaction["type"]
            name = interaction.get("name", "_".join(features))

            if interaction_type == "multiply":
                df[name] = df[features[0]] * df[features[1]]
            elif interaction_type == "ratio":
                df[name] = df[features[0]] / (df[features[1]] + 1e-8)
            elif interaction_type == "difference":
                df[name] = df[features[0]] - df[features[1]]
            elif interaction_type == "polynomial":
                degree = interaction.get("degree", 2)
                df[name] = df[features[0]] ** degree

        return df
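
Putting suggestion and application together, a sketch against a small pandas frame (the column names, descriptions, and the example suggestion are illustrative):

import pandas as pd

async def demo_interactions(client):
    discovery = FeatureInteractionDiscovery(client)

    df = pd.DataFrame({
        "monthly_spend": [120.0, 45.0, 300.0],
        "tenure_months": [24, 3, 60]
    })

    interactions = await discovery.suggest_interactions(
        feature_names=list(df.columns),
        feature_descriptions={
            "monthly_spend": "Average monthly spend in USD",
            "tenure_months": "Months since account creation"
        },
        target_description="Probability of churn in the next 90 days"
    )
    # A suggestion might look like:
    # {"name": "spend_per_tenure", "features": ["monthly_spend", "tenure_months"],
    #  "type": "ratio", "rationale": "Spend intensity normalized by account age"}
    return discovery.apply_suggested_interactions(df, interactions)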

Automated Feature Documentation

class FeatureDocumentationGenerator:
    """Generate feature documentation using LLMs."""

    def __init__(self, client):
        self.client = client

    async def document_features(
        self,
        df,
        feature_names: List[str]
    ) -> dict:
        """Generate documentation for features."""
        import pandas as pd

        # Gather statistics
        stats = {}
        for col in feature_names:
            if col in df.columns:
                col_stats = {
                    "dtype": str(df[col].dtype),
                    "null_count": int(df[col].isnull().sum()),
                    "unique_count": int(df[col].nunique())
                }

                if pd.api.types.is_numeric_dtype(df[col]):
                    col_stats.update({
                        "mean": float(df[col].mean()),
                        "std": float(df[col].std()),
                        "min": float(df[col].min()),
                        "max": float(df[col].max())
                    })
                else:
                    col_stats["sample_values"] = df[col].dropna().head(5).tolist()

                stats[col] = col_stats

        prompt = f"""Generate documentation for these ML features.

Feature Statistics:
{json.dumps(stats, indent=2)}

For each feature provide:
- description: what it represents
- type: numeric/categorical/text/datetime
- business_meaning: business interpretation
- preprocessing_notes: any preprocessing needed
- potential_issues: data quality concerns

Return as JSON with feature names as keys."""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )

        return json.loads(response.content)

    async def generate_feature_importance_explanation(
        self,
        feature_importances: dict,
        model_type: str,
        target: str
    ) -> str:
        """Explain feature importance in business terms."""

        prompt = f"""Explain these feature importances in business terms.

Model Type: {model_type}
Target Variable: {target}

Feature Importances (descending):
{json.dumps(feature_importances, indent=2)}

Provide:
1. Summary of most important features
2. Business interpretation of top features
3. Potential action items based on findings
4. Caveats about interpretation"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content
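
Usage follows the same async pattern; the feature names and importance values below are placeholders:

async def demo_documentation(client, df):
    doc_gen = FeatureDocumentationGenerator(client)

    docs = await doc_gen.document_features(df, ["monthly_spend", "spend_per_tenure"])
    # e.g. {"monthly_spend": {"description": ..., "business_meaning": ...}, ...}

    explanation = await doc_gen.generate_feature_importance_explanation(
        feature_importances={"spend_per_tenure": 0.42, "monthly_spend": 0.31},
        model_type="gradient-boosted trees",
        target="90-day churn"
    )
    print(explanation)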

Feature Store Integration

class LLMFeatureStore:
    """Feature store with LLM-powered features."""

    def __init__(self, spark, llm_client):
        self.spark = spark
        self.llm_client = llm_client
        self.extractor = LLMFeatureExtractor(llm_client)
        self.feature_definitions = {}
        self.semantic_indices = {}

    async def register_llm_feature(
        self,
        name: str,
        extraction_prompt: str,
        input_columns: List[str],
        output_schema: dict
    ):
        """Register an LLM-powered feature definition."""
        from pyspark.sql.functions import udf
        from pyspark.sql.types import MapType, StringType

        async def extract_feature(text):
            result = await self.extractor.extract_features_from_text(
                text, output_schema
            )
            return {**result.numeric_features, **result.categorical_features}

        # Store the definition; a production version would wrap extract_feature
        # in a Spark UDF with proper async handling
        self.feature_definitions[name] = {
            "prompt": extraction_prompt,
            "inputs": input_columns,
            "schema": output_schema,
            "udf": extract_feature
        }

    def compute_features(
        self,
        df,
        feature_names: List[str]
    ):
        """Compute LLM features on DataFrame."""
        from pyspark.sql.functions import col

        for name in feature_names:
            if name in self.feature_definitions:
                definition = self.feature_definitions[name]
                # Apply feature extraction
                # (Actual implementation requires proper UDF handling)
                pass

        return df

    async def create_semantic_index(
        self,
        df,
        text_column: str,
        index_name: str
    ):
        """Create semantic index for similarity features."""
        semantic_gen = SemanticFeatureGenerator(
            self.llm_client, self.llm_client
        )

        texts = df.select(text_column).collect()
        text_list = [row[0] for row in texts]

        embeddings = await semantic_gen.generate_embeddings(text_list)

        # Store embeddings for later similarity computation
        self.semantic_indices[index_name] = {
            "embeddings": embeddings,
            "texts": text_list
        }

# Usage (inside an async context, e.g. a notebook with top-level await)
store = LLMFeatureStore(spark, llm_client)

# Register LLM feature
await store.register_llm_feature(
    name="customer_sentiment",
    extraction_prompt="Extract customer sentiment from feedback",
    input_columns=["feedback_text"],
    output_schema={
        "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
        "intensity": {"type": "number"}
    }
)

# Compute features
df_with_features = store.compute_features(df, ["customer_sentiment"])
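
The semantic index can be built in the same async context; feedback_text is the illustrative column from above:

# Build a semantic index for later similarity features
await store.create_semantic_index(
    df,
    text_column="feedback_text",
    index_name="feedback_embeddings"
)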

LLM-powered feature engineering unlocks value from unstructured data. Combine semantic understanding with traditional ML for more powerful predictive models.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.