// Advanced Error Handling and Retry Mechanism System
// Comprehensive error recovery, circuit breaker, and resilience patterns

import { type AIProvider, type AIResponse } from '../types/ai-types.uts'

/**
 * Error classification and handling configuration
 */
export type ErrorHandlingConfig = {
  retryPolicy: {
    maxAttempts: number
    baseDelayMs: number
    maxDelayMs: number
    backoffMultiplier: number
    jitterEnabled: boolean
  }
  circuitBreaker: {
    failureThreshold: number
    recoveryTimeoutMs: number
    halfOpenMaxCalls: number
    monitoringWindowMs: number
  }
  rateLimit: {
    maxRequestsPerSecond: number
    burstSize: number
    enabled: boolean
  }
  fallback: {
    enabled: boolean
    fallbackProviders: AIProvider[]
    gracefulDegradation: boolean
  }
  monitoring: {
    enableMetrics: boolean
    alertOnPatterns: boolean
    maxErrorHistorySize: number
  }
}

/**
 * Error categories for different handling strategies
 */
export enum ErrorCategory {
  TRANSIENT = 'transient', // Network timeouts, temporary unavailability
  AUTHENTICATION = 'auth', // API key issues, token expiration
  RATE_LIMIT = 'rate_limit', // API rate limiting
  QUOTA_EXCEEDED = 'quota', // API quota exceeded
  INVALID_REQUEST = 'invalid', // Bad request data
  SERVICE_ERROR = 'service', // Internal service errors
  NETWORK = 'network', // Network connectivity issues
  PERMANENT = 'permanent' // Permanent failures that shouldn't be retried
}

/**
 * Detailed error information
 */
export type ErrorInfo = {
  category: ErrorCategory
  code?: string
  message: string
  provider?: AIProvider
  operation: string
  timestamp: number
  retryCount: number
  context?: Record<string, unknown>
  isRetryable: boolean
  suggestedAction?: string
}

/**
 * Circuit breaker states
 */
export enum CircuitBreakerState {
  CLOSED = 'closed', // Normal operation
  OPEN = 'open', // Circuit is open, failing fast
  HALF_OPEN = 'half_open' // Testing if service has recovered
}

/**
 * Circuit breaker status
 */
export type CircuitBreakerStatus = {
  state: CircuitBreakerState
  failureCount: number
  successCount: number
  lastFailureTime?: number
  nextAttemptTime?: number
  halfOpenAttempts: number
}

/**
 * Rate limiter status
 */
export type RateLimiterStatus = {
  requestsRemaining: number
  resetTime: number
  isLimited: boolean
  queueSize: number
}

/**
 * Retry attempt information
 */
export type RetryAttempt = {
  attemptNumber: number
  timestamp: number
  error?: ErrorInfo
  delayMs: number
  success: boolean
}

/**
 * Operation result with retry information
 */
export type OperationResult<T> = {
  success: boolean
  data?: T
  error?: ErrorInfo
  attempts: RetryAttempt[]
  totalDuration: number
  finalProvider?: AIProvider
}

/** Per-call options accepted by `AIErrorHandler.executeWithRetry`. */
type RetryContext = {
  operationName: string
  provider?: AIProvider
  retryable?: boolean
  metadata?: Record<string, unknown>
}

/** Providers that get a circuit breaker and a rate limiter at start-up. */
const KNOWN_PROVIDERS: AIProvider[] = ['openai', 'google', 'baidu']

/** Operations that get a per-provider circuit breaker at start-up. */
const KNOWN_OPERATIONS: readonly string[] = ['translate', 'analyze', 'chat', 'recommend']

/** Error categories for which another attempt can reasonably succeed. */
const RETRYABLE_CATEGORIES: readonly ErrorCategory[] = [
  ErrorCategory.TRANSIENT,
  ErrorCategory.RATE_LIMIT,
  ErrorCategory.SERVICE_ERROR,
  ErrorCategory.NETWORK
]

/** Length of one rate-limit window in milliseconds. */
const RATE_LIMIT_WINDOW_MS = 1000

/** Look-back window used when alerting on error patterns (5 minutes). */
const PATTERN_WINDOW_MS = 300000

/**
 * Advanced error handler and retry manager.
 *
 * Wraps asynchronous operations with:
 * - retry with exponential backoff (optionally jittered),
 * - a circuit breaker per `provider:operation` key,
 * - a fixed-window rate limiter per provider,
 * - a bounded error history with simple pattern alerts.
 *
 * Circuit breakers and rate limiters exist only for the providers and
 * operations registered at construction time; calls under any other key are
 * retried but are not circuit-broken or rate-limited.
 */
export class AIErrorHandler {
  private config: ErrorHandlingConfig
  private readonly circuitBreakers = new Map<string, CircuitBreakerStatus>()
  private readonly rateLimiters = new Map<string, RateLimiterStatus>()
  private errorHistory: ErrorInfo[] = []
  private readonly requestQueues = new Map<string, Array<() => Promise<unknown>>>()

  constructor(config: ErrorHandlingConfig) {
    this.config = config
    this.initializeCircuitBreakers()
    this.initializeRateLimiters()
  }

  /**
   * Execute operation with advanced error handling and retry logic.
   *
   * The call fails fast (without invoking `operation`) when the circuit
   * breaker for the key is open or the provider's rate limit is exhausted.
   * Otherwise the operation is attempted up to `retryPolicy.maxAttempts`
   * times, or exactly once when `context.retryable` is `false`. When retries
   * are exhausted on a retryable error and fallback is enabled, the fallback
   * providers are tried before giving up.
   *
   * @param operation The asynchronous work to run.
   * @param context Operation name, optional provider and retry options.
   * @returns A result object; this method resolves rather than rejects when
   *   `operation` fails.
   */
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    context: RetryContext
  ): Promise<OperationResult<T>> {
    const startTime = Date.now()
    const attempts: RetryAttempt[] = []
    let lastError: ErrorInfo | undefined

    // Check circuit breaker
    const breakerKey = this.getBreakerKey(context.operationName, context.provider)
    if (this.isCircuitOpen(breakerKey)) {
      const error = this.createError(
        ErrorCategory.SERVICE_ERROR,
        `Circuit breaker is open for ${breakerKey}`,
        context.operationName,
        context.provider
      )
      return { success: false, error, attempts: [], totalDuration: Date.now() - startTime }
    }

    // Check rate limits
    if (this.config.rateLimit.enabled && context.provider) {
      const rateLimitResult = await this.checkRateLimit(context.provider)
      if (!rateLimitResult.allowed) {
        const error = this.createError(
          ErrorCategory.RATE_LIMIT,
          'Rate limit exceeded',
          context.operationName,
          context.provider
        )
        return { success: false, error, attempts: [], totalDuration: Date.now() - startTime }
      }
    }

    // `retryable: false` means "run once"; a misconfigured maxAttempts (<= 0)
    // must still allow the single initial attempt.
    const retriesAllowed = context.retryable !== false
    const maxAttempts = retriesAllowed ? Math.max(1, this.config.retryPolicy.maxAttempts) : 1

    // Execute with retry logic
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      // Back off before every attempt except the first. The delay is recorded
      // on the failed attempt that caused it.
      if (attempt > 1) {
        const delay = this.calculateRetryDelay(attempt - 1)
        await this.sleep(delay)
        const previousAttempt = attempts[attempts.length - 1]
        if (previousAttempt) {
          previousAttempt.delayMs = delay
        }
      }

      const attemptStart = Date.now()

      try {
        const result = await operation()

        attempts.push({
          attemptNumber: attempt,
          timestamp: attemptStart,
          delayMs: 0,
          success: true
        })
        this.recordSuccess(breakerKey)

        return {
          success: true,
          data: result,
          attempts,
          totalDuration: Date.now() - startTime,
          finalProvider: context.provider
        }
      } catch (error) {
        const errorInfo = this.analyzeError(error, context.operationName, context.provider, attempt - 1)
        lastError = errorInfo

        attempts.push({
          attemptNumber: attempt,
          timestamp: attemptStart,
          error: errorInfo,
          delayMs: 0,
          success: false
        })
        this.recordError(errorInfo)
        this.recordFailure(breakerKey)

        if (!this.shouldRetry(errorInfo, attempt)) {
          break
        }
      }
    }

    // Retries are exhausted. Fall back only for errors another provider could
    // plausibly avoid, and only when the caller allowed retries at all.
    if (
      lastError !== undefined &&
      retriesAllowed &&
      this.config.fallback.enabled &&
      this.isRetryableError(lastError)
    ) {
      const fallbackResult = await this.tryFallbackProviders(operation, context, startTime, attempts)
      if (fallbackResult) {
        return fallbackResult
      }
    }

    return {
      success: false,
      error: lastError,
      attempts,
      totalDuration: Date.now() - startTime
    }
  }

  /**
   * Handle bulk operations with advanced error recovery.
   *
   * Items are processed batch by batch; inside a batch at most `concurrency`
   * operations run at the same time. Results are returned in input order.
   * With `failFast`, processing stops after the first batch whose cumulative
   * failure rate exceeds `partialFailureThreshold` (default 0.5).
   *
   * @param items Items to process.
   * @param operation Work to run for each item.
   * @param options Batch size (default 10), concurrency (default 3) and
   *   fail-fast settings.
   * @returns Per-item results plus a summary; `throughput` is items per second.
   */
  async executeBulkWithRetry<T, R>(
    items: T[],
    operation: (item: T) => Promise<R>,
    options: {
      operationName: string
      batchSize?: number
      concurrency?: number
      failFast?: boolean
      partialFailureThreshold?: number
    }
  ): Promise<{
    results: Array<{ item: T; result?: R; error?: ErrorInfo }>
    summary: {
      successful: number
      failed: number
      totalTime: number
      throughput: number
    }
  }> {
    const startTime = Date.now()
    // Clamp to >= 1: a zero batch size would never advance the loop below.
    const batchSize = Math.max(1, Math.floor(options.batchSize || 10))
    const concurrency = Math.max(1, Math.floor(options.concurrency || 3))
    // `??` so that an explicit threshold of 0 is honoured.
    const failureThreshold = options.partialFailureThreshold ?? 0.5
    const results: Array<{ item: T; result?: R; error?: ErrorInfo }> = []

    // Process in batches
    for (let i = 0; i < items.length; i += batchSize) {
      const batch = items.slice(i, i + batchSize)
      const batchIndex = Math.floor(i / batchSize)

      // Build thunks, not promises: work must not start until a worker slot
      // is free, otherwise the concurrency limit has no effect.
      const tasks = batch.map((item, offset) => async () => {
        const operationResult = await this.executeWithRetry(
          () => operation(item),
          {
            operationName: options.operationName,
            metadata: { batchIndex, itemIndex: i + offset }
          }
        )
        return { item, result: operationResult.data, error: operationResult.error }
      })

      const batchResults = await this.executeConcurrently(tasks, concurrency)
      results.push(...batchResults)

      // Check failure threshold
      const failedCount = results.filter(r => r.error).length
      const failureRate = failedCount / results.length
      if (options.failFast && failureRate > failureThreshold) {
        console.log(`⚠️ Bulk operation failing fast due to high failure rate: ${(failureRate * 100).toFixed(1)}%`)
        break
      }
    }

    const totalTime = Date.now() - startTime
    const failed = results.filter(r => r.error).length
    const successful = results.length - failed
    // Treat sub-millisecond runs as 1 ms so throughput is always finite.
    const throughput = results.length / (Math.max(totalTime, 1) / 1000)

    return {
      results,
      summary: { successful, failed, totalTime, throughput }
    }
  }

  /**
   * Get current error handling status.
   *
   * Breaker and limiter entries are copies, so callers cannot mutate the
   * handler's internal state through the returned value.
   */
  getErrorHandlingStatus(): {
    circuitBreakers: Array<{ key: string; status: CircuitBreakerStatus }>
    rateLimiters: Array<{ key: string; status: RateLimiterStatus }>
    recentErrors: ErrorInfo[]
    errorPatterns: Array<{ pattern: string; count: number; lastSeen: number }>
  } {
    const recentErrors = this.errorHistory.slice(-50) // Last 50 errors
    const errorPatterns = this.analyzeErrorPatterns(recentErrors)

    return {
      circuitBreakers: Array.from(this.circuitBreakers.entries()).map(([key, status]) => ({
        key,
        status: { ...status }
      })),
      rateLimiters: Array.from(this.rateLimiters.entries()).map(([key, status]) => ({
        key,
        status: { ...status }
      })),
      recentErrors,
      errorPatterns
    }
  }

  /**
   * Reset circuit breakers and error state
   */
  resetErrorState(): void {
    this.circuitBreakers.clear()
    this.rateLimiters.clear()
    this.errorHistory = []
    this.requestQueues.clear()
    this.initializeCircuitBreakers()
    this.initializeRateLimiters()
    console.log('🔄 Error handling state reset')
  }

  /**
   * Update configuration.
   *
   * The merge is shallow: each top-level section present in `newConfig`
   * replaces the existing section as a whole.
   */
  updateConfig(newConfig: Partial<ErrorHandlingConfig>): void {
    this.config = { ...this.config, ...newConfig }
    console.log('⚙️ Error handling configuration updated')
  }

  // Private methods

  /** Registers a closed breaker for every known provider/operation pair. */
  private initializeCircuitBreakers(): void {
    for (const provider of KNOWN_PROVIDERS) {
      for (const operation of KNOWN_OPERATIONS) {
        this.circuitBreakers.set(this.getBreakerKey(operation, provider), {
          state: CircuitBreakerState.CLOSED,
          failureCount: 0,
          successCount: 0,
          halfOpenAttempts: 0
        })
      }
    }
  }

  /** Registers a full rate-limit window for every known provider. */
  private initializeRateLimiters(): void {
    for (const provider of KNOWN_PROVIDERS) {
      this.rateLimiters.set(provider, {
        requestsRemaining: this.config.rateLimit.maxRequestsPerSecond,
        resetTime: Date.now() + RATE_LIMIT_WINDOW_MS,
        isLimited: false,
        queueSize: 0
      })
    }
  }

  /** Builds the breaker key: `provider:operation`, or just the operation. */
  private getBreakerKey(operation: string, provider?: AIProvider): string {
    return provider ? `${provider}:${operation}` : operation
  }

  /**
   * Reports whether calls under `breakerKey` must fail fast. An open breaker
   * whose recovery timeout has elapsed is moved to half-open and lets the
   * call through. Unknown keys are never considered open.
   */
  private isCircuitOpen(breakerKey: string): boolean {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker || breaker.state !== CircuitBreakerState.OPEN) {
      return false
    }

    // Check if we should transition to half-open
    const now = Date.now()
    if (
      breaker.lastFailureTime &&
      now - breaker.lastFailureTime > this.config.circuitBreaker.recoveryTimeoutMs
    ) {
      breaker.state = CircuitBreakerState.HALF_OPEN
      breaker.halfOpenAttempts = 0
      console.log(`🔄 Circuit breaker ${breakerKey} transitioning to half-open`)
      return false
    }

    return true
  }

  /**
   * Records a successful call. In half-open state the breaker closes once
   * `halfOpenMaxCalls` successes have been seen.
   */
  private recordSuccess(breakerKey: string): void {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return

    breaker.successCount++

    if (breaker.state === CircuitBreakerState.HALF_OPEN) {
      breaker.halfOpenAttempts++
      if (breaker.halfOpenAttempts >= this.config.circuitBreaker.halfOpenMaxCalls) {
        breaker.state = CircuitBreakerState.CLOSED
        breaker.failureCount = 0
        breaker.nextAttemptTime = undefined
        console.log(`✅ Circuit breaker ${breakerKey} closed after successful recovery`)
      }
    }
  }

  /**
   * Records a failed call. A closed breaker opens once `failureThreshold`
   * failures have accumulated; a half-open breaker re-opens immediately.
   */
  private recordFailure(breakerKey: string): void {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return

    const now = Date.now()
    const { failureThreshold, monitoringWindowMs, recoveryTimeoutMs } = this.config.circuitBreaker

    // Forget failures older than the monitoring window so that rare, widely
    // spaced errors do not eventually trip a healthy breaker.
    if (
      breaker.state === CircuitBreakerState.CLOSED &&
      breaker.lastFailureTime !== undefined &&
      now - breaker.lastFailureTime > monitoringWindowMs
    ) {
      breaker.failureCount = 0
    }

    breaker.failureCount++
    breaker.lastFailureTime = now

    if (breaker.state === CircuitBreakerState.CLOSED) {
      if (breaker.failureCount >= failureThreshold) {
        breaker.state = CircuitBreakerState.OPEN
        breaker.nextAttemptTime = now + recoveryTimeoutMs
        console.log(`⚠️ Circuit breaker ${breakerKey} opened due to ${breaker.failureCount} failures`)
      }
    } else if (breaker.state === CircuitBreakerState.HALF_OPEN) {
      breaker.state = CircuitBreakerState.OPEN
      breaker.nextAttemptTime = now + recoveryTimeoutMs
      console.log(`❌ Circuit breaker ${breakerKey} re-opened after failed recovery attempt`)
    }
  }

  /**
   * Consumes one request from the provider's current one-second window.
   * Providers without a registered limiter are always allowed.
   *
   * @returns `allowed: false` with the time until the window resets when the
   *   window is exhausted.
   */
  private async checkRateLimit(provider: AIProvider): Promise<{ allowed: boolean; waitTime?: number }> {
    const limiter = this.rateLimiters.get(provider)
    if (!limiter) return { allowed: true }

    const now = Date.now()

    // Reset if time window has passed
    if (now >= limiter.resetTime) {
      limiter.requestsRemaining = this.config.rateLimit.maxRequestsPerSecond
      limiter.resetTime = now + RATE_LIMIT_WINDOW_MS
      limiter.isLimited = false
    }

    if (limiter.requestsRemaining <= 0) {
      limiter.isLimited = true
      return { allowed: false, waitTime: limiter.resetTime - now }
    }

    limiter.requestsRemaining--
    return { allowed: true }
  }

  /**
   * Classifies a thrown value into an `ErrorInfo`.
   *
   * Classification uses the message text first and the numeric status code
   * second. Status codes supplied as strings (e.g. `'429'`) are understood;
   * non-numeric codes (e.g. `'ECONNRESET'`) are kept in `code` but do not
   * take part in the numeric checks.
   */
  private analyzeError(
    error: unknown,
    operation: string,
    provider?: AIProvider,
    retryCount: number = 0
  ): ErrorInfo {
    const details = (typeof error === 'object' && error !== null ? error : {}) as {
      message?: unknown
      code?: unknown
      status?: unknown
    }
    const errorMessage = details.message ? String(details.message) : String(error)
    const rawCode = details.code || details.status

    // NaN when there is no usable numeric code; every comparison is then false.
    let statusCode = Number.NaN
    if (typeof rawCode === 'number') {
      statusCode = rawCode
    } else if (typeof rawCode === 'string' && rawCode.trim() !== '') {
      statusCode = Number(rawCode)
    }

    const text = errorMessage.toLowerCase()

    let category = ErrorCategory.PERMANENT
    let isRetryable = false
    let suggestedAction = 'Review error and fix manually'

    // Analyze error to determine category and retry strategy
    if (text.includes('timeout') || text.includes('network')) {
      category = ErrorCategory.TRANSIENT
      isRetryable = true
      suggestedAction = 'Retry with exponential backoff'
    } else if (text.includes('rate limit') || statusCode === 429) {
      category = ErrorCategory.RATE_LIMIT
      isRetryable = true
      suggestedAction = 'Wait and retry, consider implementing rate limiting'
    } else if (text.includes('quota') || text.includes('exceeded')) {
      category = ErrorCategory.QUOTA_EXCEEDED
      isRetryable = false
      suggestedAction = 'Check API quota and billing'
    } else if (text.includes('auth') || text.includes('unauthorized') || statusCode === 401) {
      category = ErrorCategory.AUTHENTICATION
      isRetryable = false
      suggestedAction = 'Check API keys and authentication'
    } else if (statusCode >= 400 && statusCode < 500) {
      category = ErrorCategory.INVALID_REQUEST
      isRetryable = false
      suggestedAction = 'Review request parameters'
    } else if (statusCode >= 500) {
      category = ErrorCategory.SERVICE_ERROR
      isRetryable = true
      suggestedAction = 'Retry or use fallback provider'
    }

    return {
      category,
      code: String(rawCode || 'unknown'),
      message: errorMessage,
      provider,
      operation,
      timestamp: Date.now(),
      retryCount,
      isRetryable,
      suggestedAction,
      context: { originalError: error }
    }
  }

  /** True when the error is flagged retryable and its category allows retry. */
  private isRetryableError(error: ErrorInfo): boolean {
    return error.isRetryable && RETRYABLE_CATEGORIES.includes(error.category)
  }

  /** True when another attempt is both permitted and worthwhile. */
  private shouldRetry(error: ErrorInfo, attemptNumber: number): boolean {
    if (attemptNumber >= this.config.retryPolicy.maxAttempts) {
      return false
    }
    return this.isRetryableError(error)
  }

  /**
   * Computes the backoff before the next attempt:
   * `baseDelayMs * backoffMultiplier ^ attemptNumber`, plus up to 10% jitter
   * when enabled. The result never exceeds `maxDelayMs`, jitter included.
   */
  private calculateRetryDelay(attemptNumber: number): number {
    const { baseDelayMs, maxDelayMs, backoffMultiplier, jitterEnabled } = this.config.retryPolicy

    let delay = Math.min(baseDelayMs * Math.pow(backoffMultiplier, attemptNumber), maxDelayMs)

    // Add jitter if enabled, then re-apply the cap so jitter cannot exceed it.
    if (jitterEnabled) {
      delay = Math.min(delay + delay * 0.1 * Math.random(), maxDelayMs)
    }

    return Math.max(0, Math.floor(delay))
  }

  /**
   * Tries each configured fallback provider (excluding the one that just
   * failed) until one succeeds.
   *
   * NOTE: the operation is re-invoked as-is; it is not yet routed to the
   * fallback provider, so `finalProvider` reflects the intended provider.
   *
   * @returns A successful result, or `null` when fallback is disabled, no
   *   provider was given, or every fallback failed.
   */
  private async tryFallbackProviders<T>(
    operation: () => Promise<T>,
    context: RetryContext,
    startTime: number,
    existingAttempts: RetryAttempt[]
  ): Promise<OperationResult<T> | null> {
    if (!this.config.fallback.enabled || !context.provider) {
      return null
    }

    const fallbackProviders = this.config.fallback.fallbackProviders.filter(
      p => p !== context.provider
    )

    for (const fallbackProvider of fallbackProviders) {
      try {
        console.log(`🔄 Attempting fallback to provider: ${fallbackProvider}`)
        const result = await operation() // Note: In real implementation, this would use the fallback provider

        return {
          success: true,
          data: result,
          attempts: existingAttempts,
          totalDuration: Date.now() - startTime,
          finalProvider: fallbackProvider
        }
      } catch (error) {
        console.log(`❌ Fallback provider ${fallbackProvider} also failed:`, error)
      }
    }

    return null
  }

  /** Appends to the bounded error history and runs pattern alerts. */
  private recordError(error: ErrorInfo): void {
    this.errorHistory.push(error)

    // Maintain history size limit
    const limit = this.config.monitoring.maxErrorHistorySize
    if (this.errorHistory.length > limit) {
      this.errorHistory = this.errorHistory.slice(-limit)
    }

    // Alert on error patterns if enabled
    if (this.config.monitoring.alertOnPatterns) {
      this.checkErrorPatterns(error)
    }
  }

  /**
   * Logs an alert when the last five minutes contain at least 5 errors from
   * the same provider or at least 10 errors of the same category.
   */
  private checkErrorPatterns(error: ErrorInfo): void {
    const now = Date.now()
    const recentErrors = this.errorHistory.filter(
      e => now - e.timestamp < PATTERN_WINDOW_MS // Last 5 minutes
    )

    // Check for repeated errors from same provider
    if (error.provider) {
      const providerErrors = recentErrors.filter(e => e.provider === error.provider)
      if (providerErrors.length >= 5) {
        console.log(`🚨 High error rate detected for provider ${error.provider}: ${providerErrors.length} errors in 5 minutes`)
      }
    }

    // Check for repeated error categories
    const categoryErrors = recentErrors.filter(e => e.category === error.category)
    if (categoryErrors.length >= 10) {
      console.log(`🚨 High error rate detected for category ${error.category}: ${categoryErrors.length} errors in 5 minutes`)
    }
  }

  /**
   * Groups errors by `category:provider` and returns the groups sorted by
   * descending count.
   */
  private analyzeErrorPatterns(errors: ErrorInfo[]): Array<{ pattern: string; count: number; lastSeen: number }> {
    const patterns = new Map<string, { count: number; lastSeen: number }>()

    for (const error of errors) {
      const pattern = `${error.category}:${error.provider || 'unknown'}`
      const existing = patterns.get(pattern) ?? { count: 0, lastSeen: 0 }
      patterns.set(pattern, {
        count: existing.count + 1,
        lastSeen: Math.max(existing.lastSeen, error.timestamp)
      })
    }

    return Array.from(patterns.entries())
      .map(([pattern, data]) => ({ pattern, ...data }))
      .sort((a, b) => b.count - a.count)
  }

  /**
   * Runs the task thunks with at most `concurrency` in flight and returns
   * their results in task order. Rejects if any task rejects.
   */
  private async executeConcurrently<T>(tasks: Array<() => Promise<T>>, concurrency: number): Promise<T[]> {
    const results = new Array<T>(tasks.length)
    let nextIndex = 0

    // Each worker repeatedly claims the next unclaimed task. Claiming is
    // synchronous, so two workers never take the same index.
    const worker = async (): Promise<void> => {
      while (nextIndex < tasks.length) {
        const index = nextIndex++
        const task = tasks[index]
        if (task) {
          results[index] = await task()
        }
      }
    }

    const workerCount = Math.min(Math.max(1, Math.floor(concurrency)), tasks.length)
    await Promise.all(Array.from({ length: workerCount }, () => worker()))

    return results
  }

  /** Builds an `ErrorInfo` for failures raised by the handler itself. */
  private createError(
    category: ErrorCategory,
    message: string,
    operation: string,
    provider?: AIProvider
  ): ErrorInfo {
    return {
      category,
      message,
      operation,
      provider,
      timestamp: Date.now(),
      retryCount: 0,
      isRetryable: category !== ErrorCategory.PERMANENT
    }
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }
}

// Default configuration
export const defaultErrorHandlingConfig: ErrorHandlingConfig = {
  retryPolicy: {
    maxAttempts: 3,
    baseDelayMs: 1000, // 1 second
    maxDelayMs: 30000, // 30 seconds
    backoffMultiplier: 2,
    jitterEnabled: true
  },
  circuitBreaker: {
    failureThreshold: 5,
    recoveryTimeoutMs: 60000, // 1 minute
    halfOpenMaxCalls: 3,
    monitoringWindowMs: 300000 // 5 minutes
  },
  rateLimit: {
    maxRequestsPerSecond: 10,
    burstSize: 5,
    enabled: true
  },
  fallback: {
    enabled: true,
    fallbackProviders: ['openai', 'google', 'baidu'],
    gracefulDegradation: true
  },
  monitoring: {
    enableMetrics: true,
    alertOnPatterns: true,
    maxErrorHistorySize: 1000
  }
}