Files
akmon/uni_modules/ak-ai-news/services/AIErrorHandler.uts
2026-01-20 08:04:15 +08:00

762 lines
22 KiB
Plaintext

// Advanced Error Handling and Retry Mechanism System
// Comprehensive error recovery, circuit breaker, and resilience patterns
import { type AIProvider, type AIResponse } from '../types/ai-types.uts'
/**
 * Error classification and handling configuration.
 *
 * Consumed by AIErrorHandler; see `defaultErrorHandlingConfig` for sane defaults.
 */
export type ErrorHandlingConfig = {
  // Exponential-backoff retry settings.
  retryPolicy: {
    maxAttempts: number // total attempts including the first call
    baseDelayMs: number // delay before the first retry
    maxDelayMs: number // upper bound on any single backoff delay
    backoffMultiplier: number // factor applied per retry (e.g. 2 = doubling)
    jitterEnabled: boolean // adds up to +10% random jitter to each delay
  }
  // Per provider/operation circuit breaker settings.
  circuitBreaker: {
    failureThreshold: number // failures while CLOSED before the breaker opens
    recoveryTimeoutMs: number // time spent OPEN before a HALF_OPEN probe is allowed
    halfOpenMaxCalls: number // successes in HALF_OPEN required to close again
    monitoringWindowMs: number // NOTE(review): declared but not consulted by AIErrorHandler
  }
  // Fixed-window (1 second) per-provider rate limiting.
  rateLimit: {
    maxRequestsPerSecond: number // tokens granted per 1-second window
    burstSize: number // NOTE(review): declared but not consulted by AIErrorHandler
    enabled: boolean // master switch for rate limiting
  }
  // Fallback-provider behavior when all retries fail.
  fallback: {
    enabled: boolean
    fallbackProviders: AIProvider[] // candidates tried in order (failed provider excluded)
    gracefulDegradation: boolean // NOTE(review): declared but not consulted by AIErrorHandler
  }
  // Error-history and alerting settings.
  monitoring: {
    enableMetrics: boolean // NOTE(review): declared but not consulted by AIErrorHandler
    alertOnPatterns: boolean // log alerts when repeated errors are detected
    maxErrorHistorySize: number // cap on the rolling in-memory error log
  }
}
/**
 * Error categories for different handling strategies.
 *
 * shouldRetry() treats TRANSIENT, RATE_LIMIT, SERVICE_ERROR and NETWORK as
 * retryable; all other categories fail immediately.
 */
export enum ErrorCategory {
  TRANSIENT = 'transient', // Network timeouts, temporary unavailability
  AUTHENTICATION = 'auth', // API key issues, token expiration
  RATE_LIMIT = 'rate_limit', // API rate limiting
  QUOTA_EXCEEDED = 'quota', // API quota exceeded
  INVALID_REQUEST = 'invalid', // Bad request data
  SERVICE_ERROR = 'service', // Internal service errors
  NETWORK = 'network', // Connectivity issues (NOTE: analyzeError currently maps network errors to TRANSIENT)
  PERMANENT = 'permanent' // Permanent failures that shouldn't be retried
}
/**
 * Detailed information about a single classified error occurrence.
 */
export type ErrorInfo = {
  category: ErrorCategory // classification driving retry/alert strategy
  code?: string // provider/HTTP code as a string ('unknown' when absent)
  message: string // original error message text
  provider?: AIProvider // provider involved, when known
  operation: string // logical operation name (e.g. 'translate')
  timestamp: number // epoch ms when the error was recorded
  retryCount: number // retries already performed when this error occurred
  context?: Record<string, any> // extra data; analyzeError stores the original thrown value here
  isRetryable: boolean // whether the handler may retry after this error
  suggestedAction?: string // human-readable remediation hint
}
/**
 * Circuit breaker states.
 *
 * CLOSED -> OPEN after `failureThreshold` failures; OPEN -> HALF_OPEN once
 * `recoveryTimeoutMs` has elapsed; HALF_OPEN closes after enough successes
 * or re-opens on the first failure.
 */
export enum CircuitBreakerState {
  CLOSED = 'closed', // Normal operation
  OPEN = 'open', // Circuit is open, failing fast
  HALF_OPEN = 'half_open' // Testing if service has recovered
}
/**
 * Mutable status record for one circuit breaker (keyed by provider:operation).
 */
export type CircuitBreakerStatus = {
  state: CircuitBreakerState // current breaker state
  failureCount: number // failures accumulated since the breaker last closed
  successCount: number // lifetime success count for this breaker
  lastFailureTime?: number // epoch ms of the most recent failure
  nextAttemptTime?: number // NOTE(review): declared but never assigned in this file
  halfOpenAttempts: number // successful probe calls made while HALF_OPEN
}
/**
 * Mutable status record for one provider's fixed-window rate limiter.
 */
export type RateLimiterStatus = {
  requestsRemaining: number // tokens left in the current 1-second window
  resetTime: number // epoch ms when the window refills
  isLimited: boolean // true once the window has been exhausted
  queueSize: number // NOTE(review): declared but never updated in this file
}
/**
 * Trace record for a single execution attempt inside executeWithRetry.
 */
export type RetryAttempt = {
  attemptNumber: number // 1-based attempt index
  timestamp: number // epoch ms at the start of the attempt
  error?: ErrorInfo // present when the attempt failed
  delayMs: number // backoff delay (ms) associated with this attempt (0 for the first)
  success: boolean // whether the operation resolved
}
/**
 * Final outcome of executeWithRetry, including the per-attempt trace.
 */
export type OperationResult<T> = {
  success: boolean // overall outcome after all retries/fallbacks
  data?: T // operation result, present on success
  error?: ErrorInfo // last classified error, present on failure
  attempts: RetryAttempt[] // chronological attempt trace
  totalDuration: number // wall-clock ms from start to resolution
  finalProvider?: AIProvider // provider that ultimately served the call (may be a fallback)
}
/**
 * Advanced error handler and retry manager.
 *
 * Wraps AI-provider calls with:
 *  - exponential-backoff retries (with optional jitter),
 *  - per provider/operation circuit breakers,
 *  - per-provider fixed-window (1 second) rate limiting,
 *  - optional fallback providers and error-pattern alerting.
 */
export class AIErrorHandler {
  private config: ErrorHandlingConfig
  // Breaker state keyed by "provider:operation" (or just the operation name).
  private circuitBreakers = new Map<string, CircuitBreakerStatus>()
  // Limiter state keyed by provider id.
  private rateLimiters = new Map<string, RateLimiterStatus>()
  // Rolling log of recent errors, capped at monitoring.maxErrorHistorySize.
  private errorHistory: ErrorInfo[] = []
  // Reserved for queueing requests while rate-limited; not consumed yet.
  private requestQueues = new Map<string, Array<() => Promise<any>>>()

  constructor(config: ErrorHandlingConfig) {
    this.config = config
    this.initializeCircuitBreakers()
    this.initializeRateLimiters()
  }

  /**
   * Execute an operation with circuit breaking, rate limiting, retries and
   * (as a last resort) fallback providers.
   *
   * Never rejects: all failures are reported through the returned
   * OperationResult.
   *
   * @param operation zero-argument async callable to run
   * @param context   operation name, optional provider, and metadata
   */
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    context: {
      operationName: string
      provider?: AIProvider
      retryable?: boolean
      metadata?: Record<string, any>
    }
  ): Promise<OperationResult<T>> {
    const startTime = Date.now()
    const attempts: RetryAttempt[] = []
    let lastError: ErrorInfo | undefined

    // Fail fast while the circuit breaker is open.
    const breakerKey = this.getBreakerKey(context.operationName, context.provider)
    if (this.isCircuitOpen(breakerKey)) {
      return {
        success: false,
        error: this.createError(
          ErrorCategory.SERVICE_ERROR,
          `Circuit breaker is open for ${breakerKey}`,
          context.operationName,
          context.provider
        ),
        attempts: [],
        totalDuration: Date.now() - startTime
      }
    }

    // Enforce the per-provider rate limit before consuming an attempt.
    if (this.config.rateLimit.enabled && context.provider) {
      const rateLimitResult = await this.checkRateLimit(context.provider)
      if (!rateLimitResult.allowed) {
        return {
          success: false,
          error: this.createError(
            ErrorCategory.RATE_LIMIT,
            'Rate limit exceeded',
            context.operationName,
            context.provider
          ),
          attempts: [],
          totalDuration: Date.now() - startTime
        }
      }
    }

    // Retry loop with exponential backoff.
    for (let attempt = 1; attempt <= this.config.retryPolicy.maxAttempts; attempt++) {
      const attemptStart = Date.now()
      // FIX: the backoff delay is now recorded on the attempt it precedes.
      // Previously it was written onto the *previous* attempt's record after
      // the fact, and the current record always carried delayMs: 0.
      let delayMs = 0
      if (attempt > 1) {
        delayMs = this.calculateRetryDelay(attempt - 1)
        await this.sleep(delayMs)
      }
      try {
        const result = await operation()
        attempts.push({
          attemptNumber: attempt,
          timestamp: attemptStart,
          delayMs,
          success: true
        })
        this.recordSuccess(breakerKey)
        return {
          success: true,
          data: result,
          attempts,
          totalDuration: Date.now() - startTime,
          finalProvider: context.provider
        }
      } catch (error) {
        const errorInfo = this.analyzeError(error, context.operationName, context.provider, attempt - 1)
        lastError = errorInfo
        attempts.push({
          attemptNumber: attempt,
          timestamp: attemptStart,
          error: errorInfo,
          delayMs,
          success: false
        })
        this.recordError(errorInfo)
        this.recordFailure(breakerKey)
        // Stop retrying on non-retryable categories or exhausted attempts.
        if (!this.shouldRetry(errorInfo, attempt)) {
          break
        }
      }
    }

    // All local attempts failed — try fallback providers as a last resort.
    // FIX: the fallback previously lived inside the retry loop guarded by
    // `attempt === maxAttempts`, a branch shouldRetry() made unreachable
    // (it returns false on the final attempt, breaking out first).
    if (this.config.fallback.enabled) {
      const fallbackResult = await this.tryFallbackProviders(operation, context, startTime, attempts)
      if (fallbackResult) {
        return fallbackResult
      }
    }

    return {
      success: false,
      error: lastError,
      attempts,
      totalDuration: Date.now() - startTime
    }
  }

  /**
   * Handle bulk operations with batching, retry per item, and an optional
   * fail-fast threshold on the cumulative failure rate.
   *
   * @param items     work items, processed in batches of options.batchSize
   * @param operation per-item async worker (wrapped with executeWithRetry)
   */
  async executeBulkWithRetry<T, R>(
    items: T[],
    operation: (item: T) => Promise<R>,
    options: {
      operationName: string
      batchSize?: number
      concurrency?: number
      failFast?: boolean
      partialFailureThreshold?: number
    }
  ): Promise<{
    results: Array<{ item: T; result?: R; error?: ErrorInfo }>
    summary: {
      successful: number
      failed: number
      totalTime: number
      throughput: number
    }
  }> {
    const startTime = Date.now()
    const batchSize = options.batchSize || 10
    const concurrency = options.concurrency || 3
    const results: Array<{ item: T; result?: R; error?: ErrorInfo }> = []

    // Process in batches.
    for (let i = 0; i < items.length; i += batchSize) {
      const batch = items.slice(i, i + batchSize)
      // FIX: use the map callback index instead of batch.indexOf(item),
      // which is O(n) per item and wrong when the batch contains duplicates.
      const batchPromises = batch.map(async (item, idx) => {
        const operationResult = await this.executeWithRetry(
          () => operation(item),
          {
            operationName: options.operationName,
            metadata: { batchIndex: Math.floor(i / batchSize), itemIndex: i + idx }
          }
        )
        return {
          item,
          result: operationResult.data,
          error: operationResult.error
        }
      })
      const batchResults = await this.executeConcurrently(batchPromises, concurrency)
      results.push(...batchResults)

      // Abort early when the cumulative failure rate crosses the threshold.
      const failedCount = results.filter(r => r.error).length
      const failureRate = results.length > 0 ? failedCount / results.length : 0
      if (options.failFast && failureRate > (options.partialFailureThreshold || 0.5)) {
        console.log(`⚠️ Bulk operation failing fast due to high failure rate: ${(failureRate * 100).toFixed(1)}%`)
        break
      }
    }

    const totalTime = Date.now() - startTime
    const successful = results.filter(r => !r.error).length
    return {
      results,
      summary: {
        successful,
        failed: results.length - successful,
        totalTime,
        // Items per second; guard against a zero-ms run.
        throughput: totalTime > 0 ? results.length / (totalTime / 1000) : results.length
      }
    }
  }

  /**
   * Snapshot of circuit breakers, rate limiters, recent errors and the
   * aggregated error patterns (for dashboards/diagnostics).
   */
  getErrorHandlingStatus(): {
    circuitBreakers: Array<{ key: string; status: CircuitBreakerStatus }>
    rateLimiters: Array<{ key: string; status: RateLimiterStatus }>
    recentErrors: ErrorInfo[]
    errorPatterns: Array<{ pattern: string; count: number; lastSeen: number }>
  } {
    const recentErrors = this.errorHistory.slice(-50) // Last 50 errors
    return {
      circuitBreakers: Array.from(this.circuitBreakers.entries()).map(([key, status]) => ({ key, status })),
      rateLimiters: Array.from(this.rateLimiters.entries()).map(([key, status]) => ({ key, status })),
      recentErrors,
      errorPatterns: this.analyzeErrorPatterns(recentErrors)
    }
  }

  /**
   * Reset all breakers, limiters, queues and error history to a clean state.
   */
  resetErrorState(): void {
    this.circuitBreakers.clear()
    this.rateLimiters.clear()
    this.errorHistory = []
    this.requestQueues.clear()
    this.initializeCircuitBreakers()
    this.initializeRateLimiters()
    console.log('🔄 Error handling state reset')
  }

  /**
   * Merge a partial configuration over the current one (section-level merge;
   * a provided section replaces that section entirely).
   */
  updateConfig(newConfig: Partial<ErrorHandlingConfig>): void {
    this.config = { ...this.config, ...newConfig }
    console.log('⚙️ Error handling configuration updated')
  }

  // Private methods

  // Seed a CLOSED breaker for every known provider/operation pair.
  // NOTE(review): provider/operation lists are hard-coded; breakers for other
  // keys are simply absent (isCircuitOpen treats missing keys as closed).
  private initializeCircuitBreakers(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu']
    const operations = ['translate', 'analyze', 'chat', 'recommend']
    providers.forEach(provider => {
      operations.forEach(operation => {
        this.circuitBreakers.set(this.getBreakerKey(operation, provider), {
          state: CircuitBreakerState.CLOSED,
          failureCount: 0,
          successCount: 0,
          halfOpenAttempts: 0
        })
      })
    })
  }

  // Seed a full 1-second token window for every known provider.
  private initializeRateLimiters(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu']
    providers.forEach(provider => {
      this.rateLimiters.set(provider, {
        requestsRemaining: this.config.rateLimit.maxRequestsPerSecond,
        resetTime: Date.now() + 1000,
        isLimited: false,
        queueSize: 0
      })
    })
  }

  // Breaker map key: "provider:operation", or the bare operation name.
  private getBreakerKey(operation: string, provider?: AIProvider): string {
    return provider ? `${provider}:${operation}` : operation
  }

  // True while the breaker is OPEN; transitions OPEN -> HALF_OPEN (and
  // returns false) once recoveryTimeoutMs has elapsed since the last failure.
  private isCircuitOpen(breakerKey: string): boolean {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return false
    if (breaker.state === CircuitBreakerState.OPEN) {
      const now = Date.now()
      if (breaker.lastFailureTime &&
          now - breaker.lastFailureTime > this.config.circuitBreaker.recoveryTimeoutMs) {
        breaker.state = CircuitBreakerState.HALF_OPEN
        breaker.halfOpenAttempts = 0
        console.log(`🔄 Circuit breaker ${breakerKey} transitioning to half-open`)
        return false
      }
      return true
    }
    return false
  }

  // Count a success; in HALF_OPEN, close the breaker after enough probes.
  private recordSuccess(breakerKey: string): void {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return
    breaker.successCount++
    if (breaker.state === CircuitBreakerState.HALF_OPEN) {
      breaker.halfOpenAttempts++
      if (breaker.halfOpenAttempts >= this.config.circuitBreaker.halfOpenMaxCalls) {
        breaker.state = CircuitBreakerState.CLOSED
        breaker.failureCount = 0
        console.log(`✅ Circuit breaker ${breakerKey} closed after successful recovery`)
      }
    }
  }

  // Count a failure; open the breaker at the threshold, or immediately
  // re-open it if a HALF_OPEN probe fails.
  private recordFailure(breakerKey: string): void {
    const breaker = this.circuitBreakers.get(breakerKey)
    if (!breaker) return
    breaker.failureCount++
    breaker.lastFailureTime = Date.now()
    if (breaker.state === CircuitBreakerState.CLOSED) {
      if (breaker.failureCount >= this.config.circuitBreaker.failureThreshold) {
        breaker.state = CircuitBreakerState.OPEN
        console.log(`⚠️ Circuit breaker ${breakerKey} opened due to ${breaker.failureCount} failures`)
      }
    } else if (breaker.state === CircuitBreakerState.HALF_OPEN) {
      breaker.state = CircuitBreakerState.OPEN
      console.log(`❌ Circuit breaker ${breakerKey} re-opened after failed recovery attempt`)
    }
  }

  // Consume one token from the provider's 1-second window, refilling the
  // window when it has elapsed. NOTE(review): config.rateLimit.burstSize is
  // not consulted here.
  private async checkRateLimit(provider: AIProvider): Promise<{ allowed: boolean; waitTime?: number }> {
    const limiter = this.rateLimiters.get(provider)
    if (!limiter) return { allowed: true }
    const now = Date.now()
    if (now >= limiter.resetTime) {
      limiter.requestsRemaining = this.config.rateLimit.maxRequestsPerSecond
      limiter.resetTime = now + 1000
      limiter.isLimited = false
    }
    if (limiter.requestsRemaining <= 0) {
      limiter.isLimited = true
      return {
        allowed: false,
        waitTime: limiter.resetTime - now
      }
    }
    limiter.requestsRemaining--
    return { allowed: true }
  }

  /**
   * Classify a thrown value into an ErrorInfo, deciding category,
   * retryability and a remediation hint from the message text and code.
   */
  private analyzeError(
    error: any,
    operation: string,
    provider?: AIProvider,
    retryCount: number = 0
  ): ErrorInfo {
    const errorMessage = error?.message || String(error)
    const errorCode = error?.code || error?.status
    // FIX: the original compared a possibly undefined/string code with >=,
    // which never matches string codes like "429". Normalize to a number
    // first (NaN fails every range check, matching the old undefined path).
    const numericCode = typeof errorCode === 'number' ? errorCode : parseInt(String(errorCode), 10)
    const lowerMessage = errorMessage.toLowerCase()

    let category = ErrorCategory.PERMANENT
    let isRetryable = false
    let suggestedAction = 'Review error and fix manually'

    // Keyword/code heuristics, most specific first.
    if (lowerMessage.includes('timeout') || lowerMessage.includes('network')) {
      category = ErrorCategory.TRANSIENT
      isRetryable = true
      suggestedAction = 'Retry with exponential backoff'
    } else if (lowerMessage.includes('rate limit') || numericCode === 429) {
      category = ErrorCategory.RATE_LIMIT
      isRetryable = true
      suggestedAction = 'Wait and retry, consider implementing rate limiting'
    } else if (lowerMessage.includes('quota') || lowerMessage.includes('exceeded')) {
      category = ErrorCategory.QUOTA_EXCEEDED
      isRetryable = false
      suggestedAction = 'Check API quota and billing'
    } else if (lowerMessage.includes('auth') ||
               lowerMessage.includes('unauthorized') ||
               numericCode === 401) {
      category = ErrorCategory.AUTHENTICATION
      isRetryable = false
      suggestedAction = 'Check API keys and authentication'
    } else if (numericCode >= 400 && numericCode < 500) {
      category = ErrorCategory.INVALID_REQUEST
      isRetryable = false
      suggestedAction = 'Review request parameters'
    } else if (numericCode >= 500) {
      category = ErrorCategory.SERVICE_ERROR
      isRetryable = true
      suggestedAction = 'Retry or use fallback provider'
    }

    return {
      category,
      code: String(errorCode || 'unknown'),
      message: errorMessage,
      provider,
      operation,
      timestamp: Date.now(),
      retryCount,
      isRetryable,
      suggestedAction,
      context: {
        originalError: error
      }
    }
  }

  // Retry only retryable categories, and never beyond maxAttempts.
  private shouldRetry(error: ErrorInfo, attemptNumber: number): boolean {
    if (attemptNumber >= this.config.retryPolicy.maxAttempts) {
      return false
    }
    return error.isRetryable && [
      ErrorCategory.TRANSIENT,
      ErrorCategory.RATE_LIMIT,
      ErrorCategory.SERVICE_ERROR,
      ErrorCategory.NETWORK
    ].includes(error.category)
  }

  // Exponential backoff: base * multiplier^n, capped at maxDelayMs,
  // plus up to +10% jitter when enabled.
  private calculateRetryDelay(attemptNumber: number): number {
    const policy = this.config.retryPolicy
    let delay = Math.min(policy.baseDelayMs * Math.pow(policy.backoffMultiplier, attemptNumber), policy.maxDelayMs)
    if (policy.jitterEnabled) {
      delay += delay * 0.1 * Math.random()
    }
    return Math.floor(delay)
  }

  // Try each configured fallback provider (excluding the one that failed)
  // until one succeeds; returns null when all fail or fallback is disabled.
  private async tryFallbackProviders<T>(
    operation: () => Promise<T>,
    context: any,
    startTime: number,
    existingAttempts: RetryAttempt[]
  ): Promise<OperationResult<T> | null> {
    if (!this.config.fallback.enabled || !context.provider) {
      return null
    }
    const fallbackProviders = this.config.fallback.fallbackProviders.filter(
      p => p !== context.provider
    )
    for (const fallbackProvider of fallbackProviders) {
      try {
        console.log(`🔄 Attempting fallback to provider: ${fallbackProvider}`)
        // NOTE(review): this re-runs the same closure; a real implementation
        // would re-dispatch the call against `fallbackProvider`.
        const result = await operation()
        return {
          success: true,
          data: result,
          attempts: existingAttempts,
          totalDuration: Date.now() - startTime,
          finalProvider: fallbackProvider
        }
      } catch (error) {
        console.log(`❌ Fallback provider ${fallbackProvider} also failed:`, error)
      }
    }
    return null
  }

  // Append to the capped error history and optionally run pattern alerts.
  private recordError(error: ErrorInfo): void {
    this.errorHistory.push(error)
    if (this.errorHistory.length > this.config.monitoring.maxErrorHistorySize) {
      this.errorHistory = this.errorHistory.slice(-this.config.monitoring.maxErrorHistorySize)
    }
    if (this.config.monitoring.alertOnPatterns) {
      this.checkErrorPatterns(error)
    }
  }

  // Log alerts when the same provider (>=5) or category (>=10) keeps
  // failing within a 5-minute window.
  private checkErrorPatterns(error: ErrorInfo): void {
    const recentErrors = this.errorHistory.filter(
      e => Date.now() - e.timestamp < 300000 // Last 5 minutes
    )
    if (error.provider) {
      const providerErrors = recentErrors.filter(e => e.provider === error.provider)
      if (providerErrors.length >= 5) {
        console.log(`🚨 High error rate detected for provider ${error.provider}: ${providerErrors.length} errors in 5 minutes`)
      }
    }
    const categoryErrors = recentErrors.filter(e => e.category === error.category)
    if (categoryErrors.length >= 10) {
      console.log(`🚨 High error rate detected for category ${error.category}: ${categoryErrors.length} errors in 5 minutes`)
    }
  }

  // Aggregate errors into "category:provider" buckets, most frequent first.
  private analyzeErrorPatterns(errors: ErrorInfo[]): Array<{ pattern: string; count: number; lastSeen: number }> {
    const patterns = new Map<string, { count: number; lastSeen: number }>()
    errors.forEach(error => {
      const pattern = `${error.category}:${error.provider || 'unknown'}`
      const existing = patterns.get(pattern) || { count: 0, lastSeen: 0 }
      patterns.set(pattern, {
        count: existing.count + 1,
        lastSeen: Math.max(existing.lastSeen, error.timestamp)
      })
    })
    return Array.from(patterns.entries())
      .map(([pattern, data]) => ({ pattern, ...data }))
      .sort((a, b) => b.count - a.count)
  }

  // Collect results from already-started promises while bounding how many
  // unsettled promises are tracked at once.
  // FIX: the original removed its own wrapper after Promise.race regardless
  // of which promise actually settled, corrupting the pool accounting.
  // Each tracked promise now removes itself from the pool on settlement.
  // NOTE(review): result ordering follows completion order, as before; a
  // rejection in any input promise still propagates (callers pass promises
  // from executeWithRetry, which never rejects).
  private async executeConcurrently<T>(promises: Promise<T>[], concurrency: number): Promise<T[]> {
    const results: T[] = []
    const executing = new Set<Promise<void>>()
    for (const promise of promises) {
      const collected: Promise<void> = promise.then(result => {
        results.push(result)
      })
      const tracked: Promise<void> = collected.then(() => {
        executing.delete(tracked)
      })
      executing.add(tracked)
      if (executing.size >= concurrency) {
        await Promise.race(executing)
      }
    }
    await Promise.all(executing)
    return results
  }

  // Build a minimal ErrorInfo for synthetic (pre-flight) failures such as
  // open breakers and rate-limit rejections.
  private createError(
    category: ErrorCategory,
    message: string,
    operation: string,
    provider?: AIProvider
  ): ErrorInfo {
    return {
      category,
      message,
      operation,
      provider,
      timestamp: Date.now(),
      retryCount: 0,
      isRetryable: category !== ErrorCategory.PERMANENT
    }
  }

  // Promise-based delay helper for backoff waits.
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }
}
// Default configuration
/**
 * Sensible production defaults: 3 attempts with doubling backoff (1s..30s,
 * jittered), breakers opening after 5 failures with a 1-minute recovery
 * window, 10 req/s per provider, and all fallback providers enabled.
 */
export const defaultErrorHandlingConfig: ErrorHandlingConfig = {
  retryPolicy: {
    maxAttempts: 3,
    baseDelayMs: 1000, // 1 second
    maxDelayMs: 30000, // 30 seconds
    backoffMultiplier: 2,
    jitterEnabled: true
  },
  circuitBreaker: {
    failureThreshold: 5,
    recoveryTimeoutMs: 60000, // 1 minute
    halfOpenMaxCalls: 3,
    monitoringWindowMs: 300000 // 5 minutes
  },
  rateLimit: {
    maxRequestsPerSecond: 10,
    burstSize: 5,
    enabled: true
  },
  fallback: {
    enabled: true,
    fallbackProviders: ['openai', 'google', 'baidu'],
    gracefulDegradation: true
  },
  monitoring: {
    enableMetrics: true,
    alertOnPatterns: true,
    maxErrorHistorySize: 1000
  }
}