
Cost Optimization for AI APIs with flashQ

AI API costs can spiral out of control quickly. A single GPT-4 call costs $0.03-0.06 per 1K tokens. At scale, that adds up fast. This guide shows you how to use flashQ to implement cost controls that can reduce your AI bills by 50% or more.

Understanding AI API Pricing

Model                     Input (per 1K tokens)   Output (per 1K tokens)
GPT-4                     $0.03                   $0.06
GPT-4-Turbo               $0.01                   $0.03
GPT-3.5-Turbo             $0.0005                 $0.0015
Claude 3 Opus             $0.015                  $0.075
Claude 3 Sonnet           $0.003                  $0.015
text-embedding-3-small    $0.00002                -

Strategy 1: Rate Limiting

Control throughput to match your budget:

// Set a budget-based rate limit
// Budget: $100/day for GPT-4
// Average cost per call: ~$0.10
// Max calls: 1,000/day ≈ 42/hour ≈ 0.7/minute

const queue = new Queue('ai-tasks');

// Cap throughput to match the budget: an hourly limit gives a
// smoother distribution than a daily one.
await queue.setRateLimit(42);  // 42 calls/hour ≈ 1,008/day ≈ $100/day
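The budget arithmetic above can live in a small helper so the limit updates itself when pricing or budget changes. This is a sketch, assuming a calls-per-hour limiter as in the example; it rounds down so the worst-case daily spend stays strictly under budget:

```javascript
// Derive an hourly rate limit from a daily budget.
// Rounding down keeps worst-case spend at or below the budget.
function hourlyLimitForBudget(dailyBudgetUsd, avgCostPerCallUsd) {
  const callsPerDay = dailyBudgetUsd / avgCostPerCallUsd;
  return Math.floor(callsPerDay / 24);
}

// $100/day at ~$0.10 per call → 41 calls/hour
const limit = hourlyLimitForBudget(100, 0.10);
```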

Strategy 2: Token Tracking

// Track costs per job
new Worker('ai-tasks', async (job) => {
  const response = await openai.chat.completions.create({
    model: job.data.model || 'gpt-4',
    messages: job.data.messages,
  });

  const usage = response.usage;
  const cost = calculateCost(job.data.model, usage);

  // Log cost for tracking
  await db.costs.create({
    jobId: job.id,
    userId: job.data.userId,
    model: job.data.model,
    promptTokens: usage.prompt_tokens,
    completionTokens: usage.completion_tokens,
    cost,
    timestamp: new Date(),
  });

  return {
    result: response.choices[0].message.content,
    usage,
    cost,
  };
});

function calculateCost(model, usage) {
  const pricing = {
    'gpt-4': { input: 0.03, output: 0.06 },
    'gpt-4-turbo': { input: 0.01, output: 0.03 },
    'gpt-3.5-turbo': { input: 0.0005, output: 0.0015 },
  };

  const p = pricing[model] || pricing['gpt-4'];
  return (
    (usage.prompt_tokens / 1000) * p.input +
    (usage.completion_tokens / 1000) * p.output
  );
}
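As a sanity check on calculateCost, a 500-token prompt with a 200-token completion on GPT-4 works out to (0.5 × $0.03) + (0.2 × $0.06) = $0.027 (the function is repeated here with the pricing from the table above):

```javascript
// Same logic as calculateCost above, condensed for the worked example
function calculateCost(model, usage) {
  const pricing = {
    'gpt-4': { input: 0.03, output: 0.06 },
    'gpt-3.5-turbo': { input: 0.0005, output: 0.0015 },
  };
  const p = pricing[model] || pricing['gpt-4'];
  return (usage.prompt_tokens / 1000) * p.input +
         (usage.completion_tokens / 1000) * p.output;
}

// 500 prompt + 200 completion tokens on GPT-4:
// (0.5 * $0.03) + (0.2 * $0.06) = $0.015 + $0.012 ≈ $0.027
calculateCost('gpt-4', { prompt_tokens: 500, completion_tokens: 200 });
```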

Strategy 3: Budget Alerts

// Check budget before processing
new Worker('ai-tasks', async (job) => {
  const { userId } = job.data;

  // Check user's daily spend (costs are stored with a timestamp above)
  const startOfDay = new Date();
  startOfDay.setHours(0, 0, 0, 0);
  const dailySpend = await db.costs.sum({
    where: { userId, timestamp: { gte: startOfDay } },
  });

  const user = await db.users.get(userId);

  if (dailySpend >= user.dailyBudget) {
    // Pause this user's jobs instead of failing them
    await queue.pause(`user-${userId}`);

    // Notify the user
    await sendEmail({
      to: user.email,
      subject: 'Daily AI budget reached',
      body: `You've reached your $${user.dailyBudget} daily limit.`,
    });

    throw new Error('Budget exceeded');
  }

  // Process normally
  return await processAITask(job);
});
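Checking the budget only at processing time means a job is already enqueued before it is rejected. You can also estimate cost before enqueueing. This is a sketch: `estimateTokens` uses a rough characters-per-token heuristic and is a hypothetical stand-in for a real tokenizer such as tiktoken, and the capped output length is an assumption:

```javascript
// Rough heuristic: ~4 characters per token for English text.
// Swap in a real tokenizer (e.g. tiktoken) for accurate counts.
function estimateTokens(messages) {
  const chars = messages.map((m) => m.content).join('').length;
  return Math.ceil(chars / 4);
}

// Worst-case cost estimate, assuming output is capped at maxOutputTokens
function estimateCostUsd(model, messages, maxOutputTokens = 500) {
  const pricing = {
    'gpt-4': { input: 0.03, output: 0.06 },
    'gpt-3.5-turbo': { input: 0.0005, output: 0.0015 },
  };
  const p = pricing[model] || pricing['gpt-4'];
  return (estimateTokens(messages) / 1000) * p.input +
         (maxOutputTokens / 1000) * p.output;
}
```

An estimate like this can gate `queue.add` so obviously over-budget requests never enter the queue at all.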

Strategy 4: Smart Model Routing

// Route to cheaper models when appropriate
function selectModel(task) {
  // Simple tasks → GPT-3.5
  if (task.type === 'summarize' || task.type === 'extract') {
    return 'gpt-3.5-turbo';
  }

  // Complex reasoning → GPT-4
  if (task.type === 'analyze' || task.type === 'code') {
    return 'gpt-4-turbo';
  }

  // Default
  return 'gpt-3.5-turbo';
}

// Add jobs with optimal model
await queue.add('process', {
  ...taskData,
  model: selectModel(taskData),
});
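To put numbers on the routing above, compare the cost of the same call (1,000 input tokens, 500 output tokens) across models, using the pricing table from earlier:

```javascript
// Cost of one call at given token counts (prices are per 1K tokens)
function callCost(pricing, inputTokens, outputTokens) {
  return (inputTokens / 1000) * pricing.input +
         (outputTokens / 1000) * pricing.output;
}

const perCall = {
  'gpt-4':         callCost({ input: 0.03,   output: 0.06 },   1000, 500), // $0.060
  'gpt-4-turbo':   callCost({ input: 0.01,   output: 0.03 },   1000, 500), // $0.025
  'gpt-3.5-turbo': callCost({ input: 0.0005, output: 0.0015 }, 1000, 500), // $0.00125
};
// GPT-4 → GPT-4-Turbo saves ~58%; GPT-4 → GPT-3.5 saves ~98%.
```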

Strategy 5: Smart Batching

// Batch similar requests for embeddings
let batchBuffer = [];
let batchTimeout = null;

async function queueEmbedding(text, callback) {
  batchBuffer.push({ text, callback });

  if (!batchTimeout) {
    batchTimeout = setTimeout(async () => {
      const batch = batchBuffer;
      batchBuffer = [];
      batchTimeout = null;

      // One API call for entire batch (up to 2048 inputs)
      const response = await openai.embeddings.create({
        model: 'text-embedding-3-small',
        input: batch.map(b => b.text),
      });

      // Distribute results
      response.data.forEach((emb, i) => {
        batch[i].callback(emb.embedding);
      });
    }, 100); // Wait 100ms to collect batch
  }
}
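Callback-based batching composes awkwardly with async/await, and an API error above would leave callbacks hanging. The same idea reads more naturally as a Promise-based micro-batcher; this is a self-contained sketch, with `fakeEmbed` standing in for the real `openai.embeddings.create` call:

```javascript
// Generic micro-batcher: collects calls for delayMs, then flushes once.
// Errors reject every waiting Promise instead of dropping callbacks.
function createBatcher(flushFn, delayMs = 100) {
  let buffer = [];
  let timer = null;
  return (item) =>
    new Promise((resolve, reject) => {
      buffer.push({ item, resolve, reject });
      if (!timer) {
        timer = setTimeout(async () => {
          const batch = buffer;
          buffer = [];
          timer = null;
          try {
            const results = await flushFn(batch.map((b) => b.item));
            batch.forEach((b, i) => b.resolve(results[i]));
          } catch (err) {
            batch.forEach((b) => b.reject(err));
          }
        }, delayMs);
      }
    });
}

// fakeEmbed is a stand-in for the real embeddings API call
const fakeEmbed = async (texts) => texts.map((t) => [t.length]);
const embed = createBatcher(fakeEmbed, 100);
```

Concurrent `embed()` calls within the 100ms window share a single flush, so each caller simply awaits its own result.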

Strategy 6: Response Caching

// Cache identical requests
import crypto from 'crypto';

new Worker('ai-tasks', async (job) => {
  // Create cache key from request
  const cacheKey = crypto
    .createHash('sha256')
    .update(JSON.stringify({
      model: job.data.model,
      messages: job.data.messages,
    }))
    .digest('hex');

  // Check cache
  const cached = await redis.get(`ai:${cacheKey}`);
  if (cached) {
    return JSON.parse(cached);
  }

  // Call API
  const result = await callOpenAI(job.data);

  // Cache for 1 hour
  await redis.set(`ai:${cacheKey}`, JSON.stringify(result), 'EX', 3600);

  return result;
});
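Before tuning the TTL, it helps to know your actual hit rate. A sketch that counts hits and misses with Redis INCR (the counter key names are illustrative, and `redis` is assumed to be an ioredis-style client as above):

```javascript
// Look up a cached response and record whether it was a hit or a miss
async function getCachedResponse(redis, cacheKey) {
  const cached = await redis.get(`ai:${cacheKey}`);
  await redis.incr(cached ? 'ai:cache:hits' : 'ai:cache:misses');
  return cached ? JSON.parse(cached) : null;
}

// Fraction of lookups served from cache (0 when nothing recorded yet)
async function cacheHitRate(redis) {
  const hits = Number(await redis.get('ai:cache:hits')) || 0;
  const misses = Number(await redis.get('ai:cache:misses')) || 0;
  const total = hits + misses;
  return total === 0 ? 0 : hits / total;
}
```

If the hit rate stays low, a longer TTL or normalizing the request (e.g. trimming whitespace in messages before hashing) may recover more repeats.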

💰 Cost Reduction Summary

Rate Limiting: Caps worst-case daily spend
Model Routing: Roughly 50-98% cheaper per call, depending on the route
Batching: Fewer API requests and less rate-limit pressure on embeddings
Caching: 100% savings on repeated requests

Conclusion

flashQ gives you the tools to control AI costs effectively. Implement these strategies to keep your AI budget under control while delivering great user experiences.

Start Saving Today

Implement cost controls with flashQ.

Get Started →