Files
akmon/uni_modules/ak-ai-news/services/AIErrorHandler.uts
2026-01-20 08:04:15 +08:00

762 lines
22 KiB
Plaintext

// Advanced Error Handling and Retry Mechanism System
// Comprehensive error recovery, circuit breaker, and resilience patterns
import { type AIProvider, type AIResponse } from '../types/ai-types.uts'
/**
 * Error classification and handling configuration.
 *
 * Consumed by AIErrorHandler; see `defaultErrorHandlingConfig` for sane defaults.
 */
export type ErrorHandlingConfig = {
  // Exponential-backoff retry settings.
  retryPolicy: {
    maxAttempts: number // total attempts including the first call
    baseDelayMs: number // delay before the first retry
    maxDelayMs: number // upper bound on any single backoff delay
    backoffMultiplier: number // factor applied per retry (e.g. 2 = doubling)
    jitterEnabled: boolean // adds up to +10% random jitter to each delay
  }
  // Per provider/operation circuit breaker settings.
  circuitBreaker: {
    failureThreshold: number // failures while CLOSED before the breaker opens
    recoveryTimeoutMs: number // time spent OPEN before a HALF_OPEN probe is allowed
    halfOpenMaxCalls: number // successes in HALF_OPEN required to close again
    monitoringWindowMs: number // NOTE(review): declared but not consulted by AIErrorHandler
  }
  // Fixed-window (1 second) per-provider rate limiting.
  rateLimit: {
    maxRequestsPerSecond: number // tokens granted per 1-second window
    burstSize: number // NOTE(review): declared but not consulted by AIErrorHandler
    enabled: boolean // master switch for rate limiting
  }
  // Fallback-provider behavior when all retries fail.
  fallback: {
    enabled: boolean
    fallbackProviders: AIProvider[] // candidates tried in order (failed provider excluded)
    gracefulDegradation: boolean // NOTE(review): declared but not consulted by AIErrorHandler
  }
  // Error-history and alerting settings.
  monitoring: {
    enableMetrics: boolean // NOTE(review): declared but not consulted by AIErrorHandler
    alertOnPatterns: boolean // log alerts when repeated errors are detected
    maxErrorHistorySize: number // cap on the rolling in-memory error log
  }
}
/**
 * Error categories for different handling strategies.
 *
 * shouldRetry() treats TRANSIENT, RATE_LIMIT, SERVICE_ERROR and NETWORK as
 * retryable; all other categories fail immediately.
 */
export enum ErrorCategory {
  TRANSIENT = 'transient', // Network timeouts, temporary unavailability
  AUTHENTICATION = 'auth', // API key issues, token expiration
  RATE_LIMIT = 'rate_limit', // API rate limiting
  QUOTA_EXCEEDED = 'quota', // API quota exceeded
  INVALID_REQUEST = 'invalid', // Bad request data
  SERVICE_ERROR = 'service', // Internal service errors
  NETWORK = 'network', // Connectivity issues (NOTE: analyzeError currently maps network errors to TRANSIENT)
  PERMANENT = 'permanent' // Permanent failures that shouldn't be retried
}
/**
 * Detailed information about a single classified error occurrence.
 */
export type ErrorInfo = {
  category: ErrorCategory // classification driving retry/alert strategy
  code?: string // provider/HTTP code as a string ('unknown' when absent)
  message: string // original error message text
  provider?: AIProvider // provider involved, when known
  operation: string // logical operation name (e.g. 'translate')
  timestamp: number // epoch ms when the error was recorded
  retryCount: number // retries already performed when this error occurred
  context?: Record<string, any> // extra data; analyzeError stores the original thrown value here
  isRetryable: boolean // whether the handler may retry after this error
  suggestedAction?: string // human-readable remediation hint
}
/**
 * Circuit breaker states.
 *
 * CLOSED -> OPEN after `failureThreshold` failures; OPEN -> HALF_OPEN once
 * `recoveryTimeoutMs` has elapsed; HALF_OPEN closes after enough successes
 * or re-opens on the first failure.
 */
export enum CircuitBreakerState {
  CLOSED = 'closed', // Normal operation
  OPEN = 'open', // Circuit is open, failing fast
  HALF_OPEN = 'half_open' // Testing if service has recovered
}
/**
 * Mutable status record for one circuit breaker (keyed by provider:operation).
 */
export type CircuitBreakerStatus = {
  state: CircuitBreakerState // current breaker state
  failureCount: number // failures accumulated since the breaker last closed
  successCount: number // lifetime success count for this breaker
  lastFailureTime?: number // epoch ms of the most recent failure
  nextAttemptTime?: number // NOTE(review): declared but never assigned in this file
  halfOpenAttempts: number // successful probe calls made while HALF_OPEN
}
/**
 * Mutable status record for one provider's fixed-window rate limiter.
 */
export type RateLimiterStatus = {
  requestsRemaining: number // tokens left in the current 1-second window
  resetTime: number // epoch ms when the window refills
  isLimited: boolean // true once the window has been exhausted
  queueSize: number // NOTE(review): declared but never updated in this file
}
/**
 * Trace record for a single execution attempt inside executeWithRetry.
 */
export type RetryAttempt = {
  attemptNumber: number // 1-based attempt index
  timestamp: number // epoch ms at the start of the attempt
  error?: ErrorInfo // present when the attempt failed
  delayMs: number // backoff delay (ms) associated with this attempt (0 for the first)
  success: boolean // whether the operation resolved
}
/**
 * Final outcome of executeWithRetry, including the per-attempt trace.
 */
export type OperationResult<T> = {
  success: boolean // overall outcome after all retries/fallbacks
  data?: T // operation result, present on success
  error?: ErrorInfo // last classified error, present on failure
  attempts: RetryAttempt[] // chronological attempt trace
  totalDuration: number // wall-clock ms from start to resolution
  finalProvider?: AIProvider // provider that ultimately served the call (may be a fallback)
}
/**
 * Advanced error handler and retry manager.
 *
 * Wraps AI-provider calls with:
 *  - exponential-backoff retries (with optional jitter),
 *  - per provider/operation circuit breakers,
 *  - per-provider fixed-window (1 second) rate limiting,
 *  - optional fallback providers and error-pattern alerting.
 */
export class AIErrorHandler {
  private config: ErrorHandlingConfig
  // Breaker state keyed by "provider:operation" (or just the operation name).
  private circuitBreakers = new Map<string, CircuitBreakerStatus>()
  // Limiter state keyed by provider id.
  private rateLimiters = new Map<string, RateLimiterStatus>()
  // Rolling log of recent errors, capped at monitoring.maxErrorHistorySize.
  private errorHistory: ErrorInfo[] = []
  // Reserved for queueing requests while rate-limited; not consumed yet.
  private requestQueues = new Map<string, Array<() => Promise<any>>>()

  constructor(config: ErrorHandlingConfig) {
    this.config = config
    this.initializeCircuitBreakers()
    this.initializeRateLimiters()
  }

  /**
   * Execute an operation with circuit breaking, rate limiting, retries and
   * (as a last resort) fallback providers.
   *
   * Never rejects: all failures are reported through the returned
   * OperationResult.
   *
   * @param operation zero-argument async callable to run
   * @param context   operation name, optional provider, and metadata
   */
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    context: {
      operationName: string
      provider?: AIProvider
      retryable?: boolean
      metadata?: Record<string, any>
    }
  ): Promise<OperationResult<T>> {
    const startTime = Date.now()
    const attempts: RetryAttempt[] = []
    let lastError: ErrorInfo | undefined

    // Fail fast while the circuit breaker is open.
    const breakerKey = this.getBreakerKey(context.operationName, context.provider)
    if (this.isCircuitOpen(breakerKey)) {
      return {
        success: false,
        error: this.createError(
          ErrorCategory.SERVICE_ERROR,
          `Circuit breaker is open for ${breakerKey}`,
          context.operationName,
          context.provider
        ),
        attempts: [],
        totalDuration: Date.now() - startTime
      }
    }

    // Enforce the per-provider rate limit before consuming an attempt.
    if (this.config.rateLimit.enabled && context.provider) {
      const rateLimitResult = await this.checkRateLimit(context.provider)
      if (!rateLimitResult.allowed) {
        return {
          success: false,
          error: this.createError(
            ErrorCategory.RATE_LIMIT,
            'Rate limit exceeded',
            context.operationName,
            context.provider
          ),
          attempts: [],
          totalDuration: Date.now() - startTime
        }
      }
    }

    // Retry loop with exponential backoff.
    for (let attempt = 1; attempt <= this.config.retryPolicy.maxAttempts; attempt++) {
      const attemptStart = Date.now()
      // FIX: the backoff delay is now recorded on the attempt it precedes.
      // Previously it was written onto the *previous* attempt's record after
      // the fact, and the current record always carried delayMs: 0.
      let delayMs = 0
      if (attempt > 1) {
        delayMs = this.calculateRetryDelay(attempt - 1)
        await this.sleep(delayMs)
      }
      try {
        const result = await operation()
        attempts.push({
          attemptNumber: attempt,
          timestamp: attemptStart,
          delayMs,
          success: true
        })
        this.recordSuccess(breakerKey)
        return {
          success: true,
          data: result,
          attempts,
          totalDuration: Date.now() - startTime,
          finalProvider: context.provider
        }
      } catch (error) {
        const errorInfo = this.analyzeError(error, context.operationName, context.provider, attempt - 1)
        lastError = errorInfo
        attempts.push({
          attemptNumber: attempt,
          timestamp: attemptStart,
          error: errorInfo,
          delayMs,
          success: false
        })
        this.recordError(errorInfo)
        this.recordFailure(breakerKey)
        // Stop retrying on non-retryable categories or exhausted attempts.
        if (!this.shouldRetry(errorInfo, attempt)) {
          break
        }
      }
    }

    // All local attempts failed — try fallback providers as a last resort.
    // FIX: the fallback previously lived inside the retry loop guarded by
    // `attempt === maxAttempts`, a branch shouldRetry() made unreachable
    // (it returns false on the final attempt, breaking out first).
    if (this.config.fallback.enabled) {
      const fallbackResult = await this.tryFallbackProviders(operation, context, startTime, attempts)
      if (fallbackResult) {
        return fallbackResult
      }
    }

    return {
      success: false,
      error: lastError,
      attempts,
      totalDuration: Date.now() - startTime
    }
  }

  /**
   * Handle bulk operations with batching, retry per item, and an optional
   * fail-fast threshold on the cumulative failure rate.
   *
   * @param items     work items, processed in batches of options.batchSize
   * @param operation per-item async worker (wrapped with executeWithRetry)
   */
  async executeBulkWithRetry<T, R>(
    items: T[],
    operation: (item: T) => Promise<R>,
    options: {
      operationName: string
      batchSize?: number
      concurrency?: number
      failFast?: boolean
      partialFailureThreshold?: number
    }
  ): Promise<{
    results: Array<{ item: T; result?: R; error?: ErrorInfo }>
    summary: {
      successful: number
      failed: number
      totalTime: number
      throughput: number
    }
  }> {
    const startTime = Date.now()
    const batchSize = options.batchSize || 10
    const concurrency = options.concurrency || 3
    const results: Array<{ item: T; result?: R; error?: ErrorInfo }> = []

    // Process in batches.
    for (let i = 0; i < items.length; i += batchSize) {
      const batch = items.slice(i, i + batchSize)
      // FIX: use the map callback index instead of batch.indexOf(item),
      // which is O(n) per item and wrong when the batch contains duplicates.
      const batchPromises = batch.map(async (item, idx) => {
        const operationResult = await this.executeWithRetry(
          () => operation(item),
          {
            operationName: options.operationName,
            metadata: { batchIndex: Math.floor(i / batchSize), itemIndex: i + idx }
          }
        )
        return {
          item,
          result: operationResult.data,
          error: operationResult.error
        }
      })
      const batchResults = await this.executeConcurrently(batchPromises, concurrency)
      results.push(...batchResults)

      // Abort early when the cumulative failure rate crosses the threshold.
      const failedCount = results.filter(r => r.error).length
      const failureRate = results.length > 0 ? failedCount / results.length : 0
      if (options.failFast && failureRate > (options.partialFailureThreshold || 0.5)) {
        console.log(`⚠️ Bulk operation failing fast due to high failure rate: ${(failureRate * 100).toFixed(1)}%`)
        break
      }
    }

    const totalTime = Date.now() - startTime
    const successful = results.filter(r => !r.error).length
    return {
      results,
      summary: {
        successful,
        failed: results.length - successful,
        totalTime,
        // Items per second; guard against a zero-ms run.
        throughput: totalTime > 0 ? results.length / (totalTime / 1000) : results.length
      }
    }
  }

  /**
   * Snapshot of circuit breakers, rate limiters, recent errors and the
   * aggregated error patterns (for dashboards/diagnostics).
   */
  getErrorHandlingStatus(): {
    circuitBreakers: Array<{ key: string; status: CircuitBreakerStatus }>
    rateLimiters: Array<{ key: string; status: RateLimiterStatus }>
    recentErrors: ErrorInfo[]
    errorPatterns: Array<{ pattern: string; count: number; lastSeen: number }>
  } {
    const recentErrors = this.errorHistory.slice(-50) // Last 50 errors
    return {
      circuitBreakers: Array.from(this.circuitBreakers.entries()).map(([key, status]) => ({ key, status })),
      rateLimiters: Array.from(this.rateLimiters.entries()).map(([key, status]) => ({ key, status })),
      recentErrors,
      errorPatterns: this.analyzeErrorPatterns(recentErrors)
    }
  }

  /**
   * Reset all breakers, limiters, queues and error history to a clean state.
   */
  resetErrorState(): void {
    this.circuitBreakers.clear()
    this.rateLimiters.clear()
    this.errorHistory = []
    this.requestQueues.clear()
    this.initializeCircuitBreakers()
    this.initializeRateLimiters()
    console.log('🔄 Error handling state reset')
  }

  /**
   * Merge a partial configuration over the current one (section-level merge;
   * a provided section replaces that section entirely).
   */
  updateConfig(newConfig: Partial<ErrorHandlingConfig>): void {
    this.config = { ...this.config, ...newConfig }
    console.log('⚙️ Error handling configuration updated')
  }

  // Private methods

  // Seed a CLOSED breaker for every known provider/operation pair.
  // NOTE(review): provider/operation lists are hard-coded; breakers for other
  // keys are simply absent (isCircuitOpen treats missing keys as closed).
  private initializeCircuitBreakers(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu']
    const operations = ['translate', 'analyze', 'chat', 'recommend']
    providers.forEach(provider => {
      operations.forEach(operation => {
        this.circuitBreakers.set(this.getBreakerKey(operation, provider), {
          state: CircuitBreakerState.CLOSED,
          failureCount: 0,
          successCount: 0,
          halfOpenAttempts: 0
        })
      })
    })
  }

  // Seed a full 1-second token window for every known provider.
  private initializeRateLimiters(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu']
    providers.forEach(provider => {
      this.rateLimiters.set(provider, {
        requestsRemaining: this.config.rateLimit.maxRequestsPerSecond,
        resetTime: Date.now() + 1000,
        isLimited: false,
        queueSize: 0
      })
    })
  }

  // Breaker map key: "provider:operation", or the bare operation name.
  private getBreakerKey(operation: string, provider?: AIProvider): string {
    return provider ? `${provider}:${operation}` : operation
  }

  // True while the breaker is OPEN; transitions OPEN -> HALF_OPEN (and
  // returns false) once recoveryTimeoutMs has elapsed since the last failure.
  private isCircuitOpen(breakerKey: string): boolean {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return false
    if (breaker.state === CircuitBreakerState.OPEN) {
      const now = Date.now()
      if (breaker.lastFailureTime &&
          now - breaker.lastFailureTime > this.config.circuitBreaker.recoveryTimeoutMs) {
        breaker.state = CircuitBreakerState.HALF_OPEN
        breaker.halfOpenAttempts = 0
        console.log(`🔄 Circuit breaker ${breakerKey} transitioning to half-open`)
        return false
      }
      return true
    }
    return false
  }

  // Count a success; in HALF_OPEN, close the breaker after enough probes.
  private recordSuccess(breakerKey: string): void {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return
    breaker.successCount++
    if (breaker.state === CircuitBreakerState.HALF_OPEN) {
      breaker.halfOpenAttempts++
      if (breaker.halfOpenAttempts >= this.config.circuitBreaker.halfOpenMaxCalls) {
        breaker.state = CircuitBreakerState.CLOSED
        breaker.failureCount = 0
        console.log(`✅ Circuit breaker ${breakerKey} closed after successful recovery`)
      }
    }
  }

  // Count a failure; open the breaker at the threshold, or immediately
  // re-open it if a HALF_OPEN probe fails.
  private recordFailure(breakerKey: string): void {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return
    breaker.failureCount++
    breaker.lastFailureTime = Date.now()
    if (breaker.state === CircuitBreakerState.CLOSED) {
      if (breaker.failureCount >= this.config.circuitBreaker.failureThreshold) {
        breaker.state = CircuitBreakerState.OPEN
        console.log(`⚠️ Circuit breaker ${breakerKey} opened due to ${breaker.failureCount} failures`)
      }
    } else if (breaker.state === CircuitBreakerState.HALF_OPEN) {
      breaker.state = CircuitBreakerState.OPEN
      console.log(`❌ Circuit breaker ${breakerKey} re-opened after failed recovery attempt`)
    }
  }

  // Consume one token from the provider's 1-second window, refilling the
  // window when it has elapsed. NOTE(review): config.rateLimit.burstSize is
  // not consulted here.
  private async checkRateLimit(provider: AIProvider): Promise<{ allowed: boolean; waitTime?: number }> {
    const limiter = this.rateLimiters.get(provider)
    if (!limiter) return { allowed: true }
    const now = Date.now()
    if (now >= limiter.resetTime) {
      limiter.requestsRemaining = this.config.rateLimit.maxRequestsPerSecond
      limiter.resetTime = now + 1000
      limiter.isLimited = false
    }
    if (limiter.requestsRemaining <= 0) {
      limiter.isLimited = true
      return {
        allowed: false,
        waitTime: limiter.resetTime - now
      }
    }
    limiter.requestsRemaining--
    return { allowed: true }
  }

  /**
   * Classify a thrown value into an ErrorInfo, deciding category,
   * retryability and a remediation hint from the message text and code.
   */
  private analyzeError(
    error: any,
    operation: string,
    provider?: AIProvider,
    retryCount: number = 0
  ): ErrorInfo {
    const errorMessage = error?.message || String(error)
    const errorCode = error?.code || error?.status
    // FIX: the original compared a possibly undefined/string code with >=,
    // which never matches string codes like "429". Normalize to a number
    // first (NaN fails every range check, matching the old undefined path).
    const numericCode = typeof errorCode === 'number' ? errorCode : parseInt(String(errorCode), 10)
    const lowerMessage = errorMessage.toLowerCase()

    let category = ErrorCategory.PERMANENT
    let isRetryable = false
    let suggestedAction = 'Review error and fix manually'

    // Keyword/code heuristics, most specific first.
    if (lowerMessage.includes('timeout') || lowerMessage.includes('network')) {
      category = ErrorCategory.TRANSIENT
      isRetryable = true
      suggestedAction = 'Retry with exponential backoff'
    } else if (lowerMessage.includes('rate limit') || numericCode === 429) {
      category = ErrorCategory.RATE_LIMIT
      isRetryable = true
      suggestedAction = 'Wait and retry, consider implementing rate limiting'
    } else if (lowerMessage.includes('quota') || lowerMessage.includes('exceeded')) {
      category = ErrorCategory.QUOTA_EXCEEDED
      isRetryable = false
      suggestedAction = 'Check API quota and billing'
    } else if (lowerMessage.includes('auth') ||
               lowerMessage.includes('unauthorized') ||
               numericCode === 401) {
      category = ErrorCategory.AUTHENTICATION
      isRetryable = false
      suggestedAction = 'Check API keys and authentication'
    } else if (numericCode >= 400 && numericCode < 500) {
      category = ErrorCategory.INVALID_REQUEST
      isRetryable = false
      suggestedAction = 'Review request parameters'
    } else if (numericCode >= 500) {
      category = ErrorCategory.SERVICE_ERROR
      isRetryable = true
      suggestedAction = 'Retry or use fallback provider'
    }

    return {
      category,
      code: String(errorCode || 'unknown'),
      message: errorMessage,
      provider,
      operation,
      timestamp: Date.now(),
      retryCount,
      isRetryable,
      suggestedAction,
      context: {
        originalError: error
      }
    }
  }

  // Retry only retryable categories, and never beyond maxAttempts.
  private shouldRetry(error: ErrorInfo, attemptNumber: number): boolean {
    if (attemptNumber >= this.config.retryPolicy.maxAttempts) {
      return false
    }
    return error.isRetryable && [
      ErrorCategory.TRANSIENT,
      ErrorCategory.RATE_LIMIT,
      ErrorCategory.SERVICE_ERROR,
      ErrorCategory.NETWORK
    ].includes(error.category)
  }

  // Exponential backoff: base * multiplier^n, capped at maxDelayMs,
  // plus up to +10% jitter when enabled.
  private calculateRetryDelay(attemptNumber: number): number {
    const policy = this.config.retryPolicy
    let delay = Math.min(policy.baseDelayMs * Math.pow(policy.backoffMultiplier, attemptNumber), policy.maxDelayMs)
    if (policy.jitterEnabled) {
      delay += delay * 0.1 * Math.random()
    }
    return Math.floor(delay)
  }

  // Try each configured fallback provider (excluding the one that failed)
  // until one succeeds; returns null when all fail or fallback is disabled.
  private async tryFallbackProviders<T>(
    operation: () => Promise<T>,
    context: any,
    startTime: number,
    existingAttempts: RetryAttempt[]
  ): Promise<OperationResult<T> | null> {
    if (!this.config.fallback.enabled || !context.provider) {
      return null
    }
    const fallbackProviders = this.config.fallback.fallbackProviders.filter(
      p => p !== context.provider
    )
    for (const fallbackProvider of fallbackProviders) {
      try {
        console.log(`🔄 Attempting fallback to provider: ${fallbackProvider}`)
        // NOTE(review): this re-runs the same closure; a real implementation
        // would re-dispatch the call against `fallbackProvider`.
        const result = await operation()
        return {
          success: true,
          data: result,
          attempts: existingAttempts,
          totalDuration: Date.now() - startTime,
          finalProvider: fallbackProvider
        }
      } catch (error) {
        console.log(`❌ Fallback provider ${fallbackProvider} also failed:`, error)
      }
    }
    return null
  }

  // Append to the capped error history and optionally run pattern alerts.
  private recordError(error: ErrorInfo): void {
    this.errorHistory.push(error)
    if (this.errorHistory.length > this.config.monitoring.maxErrorHistorySize) {
      this.errorHistory = this.errorHistory.slice(-this.config.monitoring.maxErrorHistorySize)
    }
    if (this.config.monitoring.alertOnPatterns) {
      this.checkErrorPatterns(error)
    }
  }

  // Log alerts when the same provider (>=5) or category (>=10) keeps
  // failing within a 5-minute window.
  private checkErrorPatterns(error: ErrorInfo): void {
    const recentErrors = this.errorHistory.filter(
      e => Date.now() - e.timestamp < 300000 // Last 5 minutes
    )
    if (error.provider) {
      const providerErrors = recentErrors.filter(e => e.provider === error.provider)
      if (providerErrors.length >= 5) {
        console.log(`🚨 High error rate detected for provider ${error.provider}: ${providerErrors.length} errors in 5 minutes`)
      }
    }
    const categoryErrors = recentErrors.filter(e => e.category === error.category)
    if (categoryErrors.length >= 10) {
      console.log(`🚨 High error rate detected for category ${error.category}: ${categoryErrors.length} errors in 5 minutes`)
    }
  }

  // Aggregate errors into "category:provider" buckets, most frequent first.
  private analyzeErrorPatterns(errors: ErrorInfo[]): Array<{ pattern: string; count: number; lastSeen: number }> {
    const patterns = new Map<string, { count: number; lastSeen: number }>()
    errors.forEach(error => {
      const pattern = `${error.category}:${error.provider || 'unknown'}`
      const existing = patterns.get(pattern) || { count: 0, lastSeen: 0 }
      patterns.set(pattern, {
        count: existing.count + 1,
        lastSeen: Math.max(existing.lastSeen, error.timestamp)
      })
    })
    return Array.from(patterns.entries())
      .map(([pattern, data]) => ({ pattern, ...data }))
      .sort((a, b) => b.count - a.count)
  }

  // Collect results from already-started promises while bounding how many
  // unsettled promises are tracked at once.
  // FIX: the original removed its own wrapper after Promise.race regardless
  // of which promise actually settled, corrupting the pool accounting.
  // Each tracked promise now removes itself from the pool on settlement.
  // NOTE(review): result ordering follows completion order, as before; a
  // rejection in any input promise still propagates (callers pass promises
  // from executeWithRetry, which never rejects).
  private async executeConcurrently<T>(promises: Promise<T>[], concurrency: number): Promise<T[]> {
    const results: T[] = []
    const executing = new Set<Promise<void>>()
    for (const promise of promises) {
      const collected: Promise<void> = promise.then(result => {
        results.push(result)
      })
      const tracked: Promise<void> = collected.then(() => {
        executing.delete(tracked)
      })
      executing.add(tracked)
      if (executing.size >= concurrency) {
        await Promise.race(executing)
      }
    }
    await Promise.all(executing)
    return results
  }

  // Build a minimal ErrorInfo for synthetic (pre-flight) failures such as
  // open breakers and rate-limit rejections.
  private createError(
    category: ErrorCategory,
    message: string,
    operation: string,
    provider?: AIProvider
  ): ErrorInfo {
    return {
      category,
      message,
      operation,
      provider,
      timestamp: Date.now(),
      retryCount: 0,
      isRetryable: category !== ErrorCategory.PERMANENT
    }
  }

  // Promise-based delay helper for backoff waits.
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }
}
// Default configuration
/**
 * Sensible production defaults: 3 attempts with doubling backoff (1s..30s,
 * jittered), breakers opening after 5 failures with a 1-minute recovery
 * window, 10 req/s per provider, and all fallback providers enabled.
 */
export const defaultErrorHandlingConfig: ErrorHandlingConfig = {
  retryPolicy: {
    maxAttempts: 3,
    baseDelayMs: 1000, // 1 second
    maxDelayMs: 30000, // 30 seconds
    backoffMultiplier: 2,
    jitterEnabled: true
  },
  circuitBreaker: {
    failureThreshold: 5,
    recoveryTimeoutMs: 60000, // 1 minute
    halfOpenMaxCalls: 3,
    monitoringWindowMs: 300000 // 5 minutes
  },
  rateLimit: {
    maxRequestsPerSecond: 10,
    burstSize: 5,
    enabled: true
  },
  fallback: {
    enabled: true,
    fallbackProviders: ['openai', 'google', 'baidu'],
    gracefulDegradation: true
  },
  monitoring: {
    enableMetrics: true,
    alertOnPatterns: true,
    maxErrorHistorySize: 1000
  }
}