
Browser AI with WebGPU: Running Models in the Browser

WebGPU exposes the GPU to web pages, making it practical to run AI models directly in the browser with hardware acceleration. Here’s how to leverage it with Transformers.js.
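Before loading a model, it’s worth confirming the browser actually exposes a usable GPU; navigator.gpu can be present while the adapter request still fails. A minimal detection sketch (the detectWebGPU helper name is mine):

// Feature-detect WebGPU: the field may exist but the adapter request
// can still return null (e.g. blocklisted drivers)
async function detectWebGPU() {
    if (!navigator.gpu) return false;
    try {
        const adapter = await navigator.gpu.requestAdapter();
        return adapter !== null;
    } catch {
        return false;
    }
}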

Browser AI with WebGPU

// browser_ai.js - Running AI models in the browser

// WebGPU support requires Transformers.js v3, published as @huggingface/transformers
import { pipeline, env } from '@huggingface/transformers';

// Configure for browser execution: fetch models from the Hugging Face Hub
// and cache them via the browser's Cache API so reloads are fast
env.allowLocalModels = false;
env.useBrowserCache = true;

class BrowserAI {
    constructor() {
        this.models = {};
    }

    async loadModel(task, modelName) {
        // Load the model on the WebGPU backend when the browser exposes it,
        // falling back to WASM otherwise. fp16 halves memory on the GPU;
        // 8-bit quantization is the safer default on the WASM backend.
        const useWebGPU = !!navigator.gpu;
        const model = await pipeline(task, modelName, {
            device: useWebGPU ? 'webgpu' : 'wasm',
            dtype: useWebGPU ? 'fp16' : 'q8'
        });

        this.models[task] = model;
        return model;
    }

    async embed(texts) {
        if (!this.models['feature-extraction']) {
            await this.loadModel('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
        }

        // Mean-pool token embeddings into one vector per text and
        // L2-normalize so downstream cosine similarity is well-behaved
        const embeddings = await this.models['feature-extraction'](texts, {
            pooling: 'mean',
            normalize: true
        });

        return embeddings.tolist();
    }

    async classify(text, labels) {
        if (!this.models['zero-shot-classification']) {
            await this.loadModel('zero-shot-classification', 'Xenova/nli-deberta-v3-small');
        }

        return await this.models['zero-shot-classification'](text, labels);
    }

    async summarize(text) {
        if (!this.models['summarization']) {
            await this.loadModel('summarization', 'Xenova/distilbart-cnn-6-6');
        }

        // max_length / min_length are measured in tokens, not characters
        const result = await this.models['summarization'](text, {
            max_length: 150,
            min_length: 30
        });

        return result[0].summary_text;
    }

    async chat(messages) {
        if (!this.models['text-generation']) {
            await this.loadModel('text-generation', 'Xenova/Phi-3-mini-4k-instruct');
        }

        const prompt = this.formatMessages(messages);
        const result = await this.models['text-generation'](prompt, {
            max_new_tokens: 256,
            do_sample: true,  // sampling must be enabled for temperature to apply
            temperature: 0.7
        });

        // generated_text echoes the prompt; strip it to return only the reply
        return result[0].generated_text.slice(prompt.length).trim();
    }

    formatMessages(messages) {
        // Simple generic prompt format; instruct models like Phi-3 define
        // their own chat template, so treat this as a rough fallback
        return messages.map(m =>
            m.role === 'user' ? `User: ${m.content}` : `Assistant: ${m.content}`
        ).join('\n') + '\nAssistant:';
    }
}

// Usage in a web application
const ai = new BrowserAI();

// Semantic search in browser
async function semanticSearch(query, documents) {
    const queryEmbedding = await ai.embed([query]);
    const docEmbeddings = await ai.embed(documents);

    const similarities = docEmbeddings.map((emb, i) => ({
        document: documents[i],
        score: cosineSimilarity(queryEmbedding[0], emb)
    }));

    return similarities.sort((a, b) => b.score - a.score);
}

// Standard cosine similarity; since the embeddings above are normalized,
// the dot product alone would produce the same ranking
function cosineSimilarity(a, b) {
    const dot = a.reduce((sum, val, i) => sum + val * b[i], 0);
    const magA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
    const magB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
    return dot / (magA * magB);
}
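Putting it together, a quick usage sketch (the query and documents are illustrative):

// Illustrative data: rank documents against a query, log the best match
const docs = [
    'WebGPU exposes modern GPU features to web pages.',
    'Sydney has a large natural harbour.',
    'Transformers.js runs ONNX models client-side.'
];

semanticSearch('running models on the GPU in a browser', docs)
    .then(results => console.log(results[0].document, results[0].score));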

WebGPU brings near-native AI performance to web applications with zero installation.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.