// AI Content Analysis Service - Content classification, sentiment analysis, and quality assessment

import {
  ContentAnalysisResult,
  EntityResult,
  TopicResult,
  CategoryResult,
  AIProvider,
  AIResponse,
  AIServiceConfig,
  BatchProcessingOptions,
  AIServiceError,
  ContentInfo
} from '../types/ai-types.uts'

/** Kinds of analysis a caller can request. */
type AnalysisType =
  | 'sentiment'
  | 'entities'
  | 'topics'
  | 'categories'
  | 'readability'
  | 'credibility'
  | 'toxicity'
  | 'summary'
  | 'keywords'

/** Options accepted by the analysis entry points. */
type AnalysisOptions = {
  types: AnalysisType[]
  provider?: AIProvider
  model?: string
  includeScores?: boolean
  detailedResults?: boolean
  language?: string
  customCategories?: string[]
}

/** Result of a content quality assessment; `overallScore` is the mean of the seven sub-scores. */
type QualityAssessment = {
  overallScore: number
  factualAccuracy: number
  sourceReliability: number
  writingQuality: number
  objectivity: number
  completeness: number
  timeliness: number
  relevance: number
}

/** Shape of a single extracted keyword (not referenced by the code in this file). */
type KeywordResult = {
  keyword: string
  frequency: number
  importance: number
  type: 'noun' | 'verb' | 'adjective' | 'entity' | 'concept'
}

/**
 * Running counters kept by the service.
 * NOTE(review): the type arguments of the two records were missing from the
 * source; string keys mapping to numeric counters are assumed - confirm.
 */
type AnalysisStats = {
  totalAnalyses: number
  successCount: number
  errorCount: number
  avgProcessingTimeMs: number
  totalCost: number
  byProvider: Record<string, number>
  byAnalysisType: Record<string, number>
}

/** Sentence terminators, ASCII and full-width, so Chinese text is segmented as well. */
const SENTENCE_DELIMITERS = /[.!?。！？]+/

/** Error codes that mark a failure as worth retrying. */
const RETRYABLE_ERROR_CODES = ['TIMEOUT', 'RATE_LIMIT', 'SERVER_ERROR']

/** Returns the non-empty `message` of a thrown value, or `fallback` when there is none. */
function errorMessage(error: unknown, fallback: string): string {
  if (typeof error === 'object' && error !== null) {
    const message = (error as { message?: unknown }).message
    if (typeof message === 'string' && message.length > 0) return message
  }
  return fallback
}

/**
 * AI content analysis service.
 * Offers sentiment analysis, entity recognition, topic extraction, content
 * classification and quality assessment. The provider-specific methods in this
 * file are local mocks (keyword heuristics plus a simulated delay).
 */
export class AIContentAnalysisService {
  private config: AIServiceConfig

  private stats: AnalysisStats = {
    totalAnalyses: 0,
    successCount: 0,
    errorCount: 0,
    avgProcessingTimeMs: 0,
    totalCost: 0,
    byProvider: {} as Record<string, number>,
    byAnalysisType: {} as Record<string, number>
  }

  // Predefined news categories with the keywords used for matching
  private readonly NEWS_CATEGORIES = [
    { id: 'politics', name: '政治', keywords: ['政府', '政策', '选举', '法律', '议会', '总统', '部长'] },
    { id: 'economy', name: '经济', keywords: ['经济', '金融', '股市', '投资', '银行', '贸易', 'GDP'] },
    { id: 'technology', name: '科技', keywords: ['科技', '人工智能', '互联网', '软件', '硬件', '创新', '数字化'] },
    { id: 'sports', name: '体育', keywords: ['体育', '足球', '篮球', '奥运', '比赛', '运动员', '锦标赛'] },
    { id: 'entertainment', name: '娱乐', keywords: ['娱乐', '电影', '音乐', '明星', '综艺', '演出', '艺术'] },
    { id: 'health', name: '健康', keywords: ['健康', '医疗', '病毒', '疫苗', '医院', '药物', '疾病'] },
    { id: 'education', name: '教育', keywords: ['教育', '学校', '大学', '学生', '教师', '考试', '学习'] },
    { id: 'environment', name: '环境', keywords: ['环境', '气候', '污染', '环保', '生态', '绿色', '可持续'] },
    { id: 'international', name: '国际', keywords: ['国际', '外交', '战争', '和平', '联合国', '条约', '全球'] },
    { id: 'social', name: '社会', keywords: ['社会', '社区', '公益', '慈善', '志愿者', '文化', '传统'] }
  ]

  constructor(config: AIServiceConfig) {
    this.config = config
    this.initializeStats()
  }

  /**
   * Analyzes a single piece of content.
   *
   * All analyses are started together; one that rejects falls back to its
   * default value instead of failing the whole call. Readability, credibility
   * and toxicity are always computed, regardless of `options.types`.
   *
   * @param content Text to analyze
   * @param options Analysis options
   * @returns A success response carrying the merged result, or a failure
   *          response with `error` / `errorCode`; this method does not throw.
   */
  async analyzeContent(
    content: string,
    options: AnalysisOptions = {
      types: ['sentiment', 'entities', 'topics', 'categories', 'readability', 'summary', 'keywords']
    }
  ): Promise<AIResponse<ContentAnalysisResult>> {
    try {
      this.stats.totalAnalyses++
      const startTime = Date.now()

      // Pick a provider
      const provider = options.provider || this.selectBestProvider()

      // Run every analysis
      const results = await Promise.allSettled([
        this.analyzeSentiment(content, provider, options),
        this.extractEntities(content, provider, options),
        this.extractTopics(content, provider, options),
        this.classifyContent(content, options),
        this.assessReadability(content, options.language),
        this.assessCredibility(content),
        this.assessToxicity(content, provider),
        this.generateSummary(content, provider, options),
        this.extractKeywords(content, options)
      ])

      const processingTime = Date.now() - startTime
      const sentimentScore = this.extractResult(results[0], 0)

      // Merge the individual results
      const analysisResult: ContentAnalysisResult = {
        contentId: this.generateContentId(content),
        sentimentScore,
        sentimentLabel: this.getSentimentLabel(sentimentScore),
        readabilityScore: this.extractResult(results[4], 0.5),
        credibilityScore: this.extractResult(results[5], 0.5),
        toxicityScore: this.extractResult(results[6], 0),
        keywords: this.extractResult(results[8], []),
        entities: this.extractResult(results[1], []),
        topics: this.extractResult(results[2], []),
        categories: this.extractResult(results[3], []),
        summary: this.extractResult(results[7], ''),
        keyPhrases: this.extractKeyPhrases(content),
        language: options.language || await this.detectLanguage(content),
        processingTimeMs: processingTime,
        provider
      }

      // Count the success first: the running average is taken over successful runs
      this.stats.successCount++
      this.updateStats(provider, options.types, processingTime)

      return {
        success: true,
        data: analysisResult,
        processingTimeMs: processingTime,
        provider
      }
    } catch (error) {
      this.stats.errorCount++

      const aiError: AIServiceError = {
        code: 'ANALYSIS_ERROR',
        message: errorMessage(error, 'Content analysis failed'),
        provider: options.provider,
        retryable: this.isRetryableError(error)
      }

      return {
        success: false,
        error: aiError.message,
        errorCode: aiError.code
      }
    }
  }

  /**
   * Analyzes many pieces of content in batches.
   *
   * Items in one batch run concurrently; batches run one after another with an
   * optional delay in between. Failed items are reported through
   * `batchOptions.onError` and left out of the returned array, so the response
   * is `success: true` even when some items failed.
   * NOTE(review): `batchOptions.concurrency` and `batchOptions.retryCount` are
   * not read by this method.
   *
   * @param contents Texts to analyze
   * @param options Analysis options applied to every item
   * @param batchOptions Batch processing options
   */
  async analyzeContentBatch(
    contents: string[],
    options: AnalysisOptions = { types: ['sentiment', 'categories', 'summary'] },
    batchOptions: BatchProcessingOptions = {
      batchSize: 5,
      concurrency: 2,
      retryCount: 2,
      delayMs: 1000
    }
  ): Promise<AIResponse<ContentAnalysisResult[]>> {
    try {
      const results: ContentAnalysisResult[] = []
      const batches = this.createBatches(contents, batchOptions.batchSize)

      for (let i = 0; i < batches.length; i++) {
        const batchPromises = batches[i].map(async (content) => {
          try {
            const response = await this.analyzeContent(content, options)
            if (response.success && response.data) {
              return response.data
            }
            throw new Error(response.error || 'Analysis failed')
          } catch (error) {
            const failure = error instanceof Error ? error : new Error(String(error))
            if (batchOptions.onError) {
              batchOptions.onError(failure, content)
            }
            throw failure
          }
        })

        const batchResults = await Promise.allSettled(batchPromises)
        for (const result of batchResults) {
          if (result.status === 'fulfilled') {
            results.push(result.value)
          }
        }

        // Progress callback
        if (batchOptions.onProgress) {
          batchOptions.onProgress(results.length, contents.length)
        }

        // Pause between batches (not after the last one)
        if (i < batches.length - 1 && batchOptions.delayMs > 0) {
          await this.delay(batchOptions.delayMs)
        }
      }

      return { success: true, data: results }
    } catch (error) {
      return {
        success: false,
        error: errorMessage(error, 'Batch analysis failed')
      }
    }
  }

  /**
   * Assesses content quality along seven dimensions and averages them.
   *
   * @param content Text to assess
   * @param metadata Optional content metadata; `sourceUrl`, `publishedAt` and
   *                 `categoryId` are read when present
   * @returns A success response with the assessment, or a failure response
   */
  async assessQuality(
    content: string,
    metadata?: Partial<ContentInfo>
  ): Promise<AIResponse<QualityAssessment>> {
    try {
      const [
        factualScore,
        sourceScore,
        writingScore,
        objectivityScore,
        completenessScore,
        timelinessScore,
        relevanceScore
      ] = await Promise.all([
        this.assessFactualAccuracy(content),
        this.assessSourceReliability(metadata?.sourceUrl || ''),
        this.assessWritingQuality(content),
        this.assessObjectivity(content),
        this.assessCompleteness(content),
        // `??` so that an explicit timestamp of 0 is not replaced by "now"
        this.assessTimeliness(metadata?.publishedAt ?? Date.now()),
        this.assessRelevance(content, metadata?.categoryId)
      ])

      const overallScore = (
        factualScore +
        sourceScore +
        writingScore +
        objectivityScore +
        completenessScore +
        timelinessScore +
        relevanceScore
      ) / 7

      const assessment: QualityAssessment = {
        overallScore,
        factualAccuracy: factualScore,
        sourceReliability: sourceScore,
        writingQuality: writingScore,
        objectivity: objectivityScore,
        completeness: completenessScore,
        timeliness: timelinessScore,
        relevance: relevanceScore
      }

      return { success: true, data: assessment }
    } catch (error) {
      return {
        success: false,
        error: errorMessage(error, 'Quality assessment failed')
      }
    }
  }

  /**
   * Returns a snapshot of the counters. The nested maps are copied as well, so
   * mutating the returned object does not affect the service.
   */
  getStatistics(): AnalysisStats {
    return {
      ...this.stats,
      byProvider: { ...this.stats.byProvider },
      byAnalysisType: { ...this.stats.byAnalysisType }
    }
  }

  // Private methods

  /** Sentiment score; 0 when 'sentiment' was not requested. Unknown providers use the basic heuristic. */
  private async analyzeSentiment(
    content: string,
    provider: AIProvider,
    options: AnalysisOptions
  ): Promise<number> {
    if (!options.types.includes('sentiment')) return 0

    switch (provider) {
      case 'openai':
        return await this.analyzeSentimentWithOpenAI(content)
      case 'google':
        return await this.analyzeSentimentWithGoogle(content)
      case 'baidu':
        return await this.analyzeSentimentWithBaidu(content)
      default:
        return this.analyzeSentimentBasic(content)
    }
  }

  /** Extracted entities; empty when 'entities' was not requested. */
  private async extractEntities(
    content: string,
    provider: AIProvider,
    options: AnalysisOptions
  ): Promise<EntityResult[]> {
    if (!options.types.includes('entities')) return []

    switch (provider) {
      case 'openai':
        return await this.extractEntitiesWithOpenAI(content)
      case 'google':
        return await this.extractEntitiesWithGoogle(content)
      default:
        return this.extractEntitiesBasic(content)
    }
  }

  /** Extracted topics; empty when 'topics' was not requested. */
  private async extractTopics(
    content: string,
    provider: AIProvider,
    options: AnalysisOptions
  ): Promise<TopicResult[]> {
    if (!options.types.includes('topics')) return []

    switch (provider) {
      case 'openai':
        return await this.extractTopicsWithOpenAI(content)
      default:
        return this.extractTopicsBasic(content)
    }
  }

  /**
   * Keyword-based classification against NEWS_CATEGORIES.
   * Confidence is the share of a category's keywords found in the content; the
   * three best categories are returned, highest confidence first.
   */
  private async classifyContent(content: string, options: AnalysisOptions): Promise<CategoryResult[]> {
    if (!options.types.includes('categories')) return []

    const haystack = content.toLowerCase()
    const categories: CategoryResult[] = []

    for (const category of this.NEWS_CATEGORIES) {
      const matches = category.keywords.filter(keyword => haystack.includes(keyword.toLowerCase()))

      if (matches.length > 0) {
        categories.push({
          categoryId: category.id,
          categoryName: category.name,
          confidence: Math.min(matches.length / category.keywords.length, 1),
          level: 1
        })
      }
    }

    // Sort by confidence, best first
    return categories.sort((a, b) => b.confidence - a.confidence).slice(0, 3)
  }

  /**
   * Simplified readability score in [0, 1]. Starts at 1 and is reduced for long
   * sentences and long whitespace-separated words. `language` is not used.
   */
  private assessReadability(content: string, language?: string): number {
    const sentences = content.split(SENTENCE_DELIMITERS).filter(s => s.trim().length > 0)
    const words = content.split(/\s+/).filter(w => w.length > 0)
    const characters = content.replace(/\s/g, '').length

    if (sentences.length === 0 || words.length === 0) return 0

    const avgWordsPerSentence = words.length / sentences.length
    const avgCharsPerWord = characters / words.length

    let score = 1.0

    // Sentence length penalty (cumulative)
    if (avgWordsPerSentence > 20) score -= 0.2
    if (avgWordsPerSentence > 30) score -= 0.3

    // Vocabulary complexity penalty (cumulative)
    if (avgCharsPerWord > 6) score -= 0.1
    if (avgCharsPerWord > 8) score -= 0.2

    return Math.max(0, Math.min(1, score))
  }

  /** Heuristic credibility score in [0, 1] based on marker words found in the content. */
  private assessCredibility(content: string): number {
    let score = 0.5 // base score

    // Mentions a citation or source
    if (content.includes('据') || content.includes('根据') || content.includes('来源')) {
      score += 0.2
    }

    // Contains concrete figures
    if (/\d+%|\d+万|\d+亿|\d{4}年/.test(content)) {
      score += 0.15
    }

    // Penalize absolute wording
    const extremeWords = ['绝对', '必然', '完全', '永远', '从来', '所有']
    const extremeCount = extremeWords.filter(word => content.includes(word)).length
    score -= extremeCount * 0.05

    // Penalize emotional wording
    const emotionalWords = ['震惊', '愤怒', '可怕', '惊人', '令人发指']
    const emotionalCount = emotionalWords.filter(word => content.includes(word)).length
    score -= emotionalCount * 0.03

    return Math.max(0, Math.min(1, score))
  }

  /** Basic toxicity score: number of distinct toxic words present, divided by 10. `provider` is not used. */
  private async assessToxicity(content: string, provider: AIProvider): Promise<number> {
    const toxicWords = ['仇恨', '歧视', '暴力', '威胁', '诽谤', '侮辱']
    const toxicCount = toxicWords.filter(word => content.includes(word)).length

    return Math.min(toxicCount / 10, 1)
  }

  /**
   * Naive summary: the first two sentences longer than 10 characters, joined
   * with '。'. Empty when 'summary' was not requested.
   */
  private async generateSummary(
    content: string,
    provider: AIProvider,
    options: AnalysisOptions
  ): Promise<string> {
    if (!options.types.includes('summary')) return ''

    const sentences = content.split(SENTENCE_DELIMITERS).filter(s => s.trim().length > 10)
    return sentences.slice(0, 2).join('。') + (sentences.length > 2 ? '。' : '')
  }

  /**
   * Frequency-based keyword extraction: the ten most frequent lower-cased,
   * whitespace-separated tokens longer than one character.
   */
  private extractKeywords(content: string, options: AnalysisOptions): string[] {
    if (!options.types.includes('keywords')) return []

    const words = content
      .replace(/[^\u4e00-\u9fa5\w\s]/g, '') // keep Chinese characters, word characters and whitespace
      .split(/\s+/)
      .filter(word => word.length > 1)

    // Count occurrences
    const wordCount: Record<string, number> = {}
    for (const word of words) {
      const lower = word.toLowerCase()
      wordCount[lower] = (wordCount[lower] || 0) + 1
    }

    // Most frequent first, top 10
    return Object.entries(wordCount)
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([word]) => word)
  }

  /** Collects two- and three-token phrases in order of appearance; returns the first five distinct ones. */
  private extractKeyPhrases(content: string): string[] {
    const phrases: string[] = []
    const words = content.split(/\s+/)

    for (let i = 0; i < words.length - 1; i++) {
      const twoWordPhrase = words.slice(i, i + 2).join(' ')
      if (twoWordPhrase.length > 4) {
        phrases.push(twoWordPhrase)
      }

      if (i < words.length - 2) {
        const threeWordPhrase = words.slice(i, i + 3).join(' ')
        if (threeWordPhrase.length > 6) {
          phrases.push(threeWordPhrase)
        }
      }
    }

    // De-duplicate and keep the first five
    return [...new Set(phrases)].slice(0, 5)
  }

  /**
   * Basic language detection by comparing how many Chinese characters and how
   * many ASCII letters the content contains.
   * @returns 'zh-CN', 'en', or 'auto' when the counts are equal
   */
  private async detectLanguage(content: string): Promise<string> {
    // The global flag is required: without it match() yields at most one
    // element, so the two "counts" could never exceed 1.
    const chineseMatches = content.match(/[\u4e00-\u9fff]/g)?.length ?? 0
    const englishMatches = content.match(/[a-zA-Z]/g)?.length ?? 0

    if (chineseMatches > englishMatches) return 'zh-CN'
    if (englishMatches > chineseMatches) return 'en'
    return 'auto'
  }

  /** Maps a sentiment score to a label, with a neutral band of [-0.1, 0.1]. */
  private getSentimentLabel(score: number): 'positive' | 'negative' | 'neutral' {
    if (score > 0.1) return 'positive'
    if (score < -0.1) return 'negative'
    return 'neutral'
  }

  /** Builds an id from the current time and the first 10 characters of the content. */
  private generateContentId(content: string): string {
    return `content_${Date.now()}_${content.substring(0, 10).replace(/\s/g, '_')}`
  }

  /** Unwraps a settled promise result, substituting `defaultValue` for a rejection. */
  private extractResult<T>(result: PromiseSettledResult<T>, defaultValue: T): T {
    return result.status === 'fulfilled' ? result.value : defaultValue
  }

  /** First provider with a configured API key; 'openai' when none is configured. */
  private selectBestProvider(): AIProvider {
    if (this.config.openai?.apiKey) return 'openai'
    if (this.config.google?.apiKey) return 'google'
    if (this.config.baidu?.apiKey) return 'baidu'
    return 'openai'
  }

  /**
   * Splits `items` into consecutive chunks of at most `batchSize` elements.
   * A non-positive or non-numeric size is treated as 1 so the loop always advances.
   */
  private createBatches<T>(items: T[], batchSize: number): T[][] {
    const size = Math.max(1, Math.floor(batchSize) || 1)
    const batches: T[][] = []
    for (let i = 0; i < items.length; i += size) {
      batches.push(items.slice(i, i + size))
    }
    return batches
  }

  /** Resolves after `ms` milliseconds. */
  private async delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms))
  }

  /** Seeds the per-provider and per-analysis-type counters with 0. */
  private initializeStats(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu', 'custom']
    const analysisTypes: AnalysisType[] = [
      'sentiment',
      'entities',
      'topics',
      'categories',
      'readability',
      'credibility',
      'toxicity',
      'summary',
      'keywords'
    ]

    for (const provider of providers) {
      this.stats.byProvider[provider] = 0
    }
    for (const type of analysisTypes) {
      this.stats.byAnalysisType[type] = 0
    }
  }

  /**
   * Records one successful analysis. Expects `successCount` to already include
   * the run being recorded; the running average is taken over successful runs
   * only, since failed runs contribute no processing time.
   */
  private updateStats(provider: AIProvider, types: AnalysisType[], processingTime: number): void {
    // `?? 0` keeps the counters numeric for keys that were never seeded
    this.stats.byProvider[provider] = (this.stats.byProvider[provider] ?? 0) + 1
    for (const type of types) {
      this.stats.byAnalysisType[type] = (this.stats.byAnalysisType[type] ?? 0) + 1
    }

    const completed = Math.max(this.stats.successCount, 1)
    this.stats.avgProcessingTimeMs =
      (this.stats.avgProcessingTimeMs * (completed - 1) + processingTime) / completed
  }

  /** True when the thrown value carries a retryable `code` or a numeric `status` of 500 or above. */
  private isRetryableError(error: unknown): boolean {
    if (typeof error !== 'object' || error === null) return false

    const { code, status } = error as { code?: unknown; status?: unknown }
    if (typeof code === 'string' && RETRYABLE_ERROR_CODES.includes(code)) return true
    return typeof status === 'number' && status >= 500
  }

  // Quality assessment methods

  /** Heuristic score: 0.5 plus 0.1 for each kind of verifiable detail found (date, figure, place, person/organization). */
  private async assessFactualAccuracy(content: string): Promise<number> {
    let score = 0.5

    // Contains a date
    if (/\d{4}年|\d{1,2}月|\d{1,2}日/.test(content)) score += 0.1

    // Contains a concrete figure
    if (/\d+\.?\d*%|\d+万|\d+亿|\d+千/.test(content)) score += 0.1

    // Contains a place
    if (/市|省|县|区|国|州/.test(content)) score += 0.1

    // Contains a person or organization marker
    if (/先生|女士|部长|主席|公司|集团|大学|医院/.test(content)) score += 0.1

    return Math.min(1, score)
  }

  /** Scores a source URL by substring match against known domains; 0.3 when no URL is given. */
  private async assessSourceReliability(sourceUrl: string): Promise<number> {
    if (!sourceUrl) return 0.3

    const reliableDomains = ['gov.cn', 'edu.cn', 'xinhuanet.com', 'people.com.cn', 'cctv.com']
    const domain = sourceUrl.toLowerCase()

    if (reliableDomains.some(reliableDomain => domain.includes(reliableDomain))) return 0.9

    if (domain.includes('.gov') || domain.includes('.edu')) return 0.8
    if (domain.includes('news') || domain.includes('media')) return 0.6

    return 0.4
  }

  /** Heuristic writing quality from sentence count, paragraph count and vocabulary diversity. */
  private async assessWritingQuality(content: string): Promise<number> {
    let score = 0.5

    // Sentence structure
    const sentences = content.split(SENTENCE_DELIMITERS).filter(s => s.trim().length > 0)
    if (sentences.length > 2) score += 0.1

    // Paragraph structure
    const paragraphs = content.split('\n\n').filter(p => p.trim().length > 0)
    if (paragraphs.length > 1) score += 0.1

    // Vocabulary diversity; guarded so empty content does not produce NaN
    const words = content.split(/\s+/).filter(w => w.length > 0)
    const uniqueWords = new Set(words.map(w => w.toLowerCase()))
    const diversity = words.length > 0 ? uniqueWords.size / words.length : 0
    score += diversity * 0.3

    return Math.min(1, score)
  }

  /** Objectivity score in [0, 1]: starts at 0.7 and is reduced per subjective or emotional word found. */
  private async assessObjectivity(content: string): Promise<number> {
    let score = 0.7 // base objectivity score

    // Subjective wording penalty
    const subjectiveWords = ['我认为', '个人觉得', '显然', '明显', '无疑', '肯定']
    const subjectiveCount = subjectiveWords.filter(word => content.includes(word)).length
    score -= subjectiveCount * 0.1

    // Emotional wording penalty
    const emotionalWords = ['愤怒', '激动', '兴奋', '失望', '震惊', '惊喜']
    const emotionalCount = emotionalWords.filter(word => content.includes(word)).length
    score -= emotionalCount * 0.05

    return Math.max(0, Math.min(1, score))
  }

  /** Completeness score from content length and the presence of who/what/when/where/why markers. */
  private async assessCompleteness(content: string): Promise<number> {
    let score = 0.3

    // Length bonuses (cumulative)
    if (content.length > 200) score += 0.2
    if (content.length > 500) score += 0.2
    if (content.length > 1000) score += 0.2

    // Key news elements
    const hasWho = /人|者|员|家|国|公司|组织/.test(content)
    const hasWhat = /事件|活动|发生|进行|宣布|决定/.test(content)
    const hasWhen = /\d{4}年|\d{1,2}月|\d{1,2}日|今天|昨天|明天/.test(content)
    const hasWhere = /市|省|县|区|国|地区|地点/.test(content)
    const hasWhy = /因为|由于|原因|目的|为了/.test(content)

    const elements = [hasWho, hasWhat, hasWhen, hasWhere, hasWhy].filter(Boolean).length
    score += elements * 0.06

    return Math.min(1, score)
  }

  /**
   * Timeliness by age: the newer the content, the higher the score.
   * @param publishedAt Publication time in milliseconds (compared against Date.now())
   */
  private async assessTimeliness(publishedAt: number): Promise<number> {
    const ageHours = (Date.now() - publishedAt) / (1000 * 60 * 60)

    if (ageHours < 1) return 1.0
    if (ageHours < 6) return 0.9
    if (ageHours < 24) return 0.7
    if (ageHours < 72) return 0.5
    if (ageHours < 168) return 0.3
    return 0.1
  }

  /** Relevance to a category: matched keyword share plus 0.3, capped at 1; 0.5 when the category is missing or unknown. */
  private async assessRelevance(content: string, categoryId?: string): Promise<number> {
    if (!categoryId) return 0.5

    const category = this.NEWS_CATEGORIES.find(c => c.id === categoryId)
    if (!category) return 0.5

    const haystack = content.toLowerCase()
    const matches = category.keywords.filter(keyword => haystack.includes(keyword.toLowerCase())).length

    return Math.min(1, matches / category.keywords.length + 0.3)
  }

  // Mock AI service methods

  /** Mock OpenAI sentiment: simulated latency plus a word-list polarity score in [-1, 1]. */
  private async analyzeSentimentWithOpenAI(content: string): Promise<number> {
    await this.delay(Math.random() * 500 + 200)

    const positiveWords = ['好', '棒', '优秀', '成功', '胜利', '喜悦', '高兴', '满意']
    const negativeWords = ['坏', '糟糕', '失败', '问题', '困难', '悲伤', '愤怒', '失望']

    const positiveCount = positiveWords.filter(word => content.includes(word)).length
    const negativeCount = negativeWords.filter(word => content.includes(word)).length

    const score = (positiveCount - negativeCount) / Math.max(positiveCount + negativeCount, 1)
    return Math.max(-1, Math.min(1, score))
  }

  /** Mock Google sentiment: simulated latency, then a random score in [-1, 1). */
  private async analyzeSentimentWithGoogle(content: string): Promise<number> {
    await this.delay(Math.random() * 400 + 150)
    return Math.random() * 2 - 1
  }

  /** Mock Baidu sentiment: simulated latency, then a random score in [-1, 1). */
  private async analyzeSentimentWithBaidu(content: string): Promise<number> {
    await this.delay(Math.random() * 300 + 100)
    return Math.random() * 2 - 1
  }

  /** Basic word-list sentiment over the lower-cased content (Chinese and English words). */
  private analyzeSentimentBasic(content: string): number {
    const positiveWords = ['好', '棒', '优秀', '成功', '胜利', 'great', 'good', 'excellent']
    const negativeWords = ['坏', '糟糕', '失败', '问题', 'bad', 'terrible', 'awful']

    const haystack = content.toLowerCase()
    const positiveCount = positiveWords.filter(word => haystack.includes(word)).length
    const negativeCount = negativeWords.filter(word => haystack.includes(word)).length

    return (positiveCount - negativeCount) / Math.max(positiveCount + negativeCount, 1)
  }

  /** Mock OpenAI entity extraction: simulated latency plus regex patterns; at most 10 entities. */
  private async extractEntitiesWithOpenAI(content: string): Promise<EntityResult[]> {
    await this.delay(Math.random() * 600 + 300)

    const entities: EntityResult[] = []

    const patterns = [
      { regex: /[\u4e00-\u9fa5]{2,4}(公司|集团|企业|机构)/g, type: 'organization' as const },
      { regex: /[\u4e00-\u9fa5]{2,3}(市|省|县|区)/g, type: 'location' as const },
      { regex: /[\u4e00-\u9fa5]{2,4}(先生|女士|部长|主席|总裁|经理)/g, type: 'person' as const },
      { regex: /\d{4}年\d{1,2}月\d{1,2}日/g, type: 'date' as const },
      { regex: /\d+\.?\d*(万|亿|千)?(元|美元|英镑)/g, type: 'money' as const }
    ]

    for (const pattern of patterns) {
      for (const match of content.matchAll(pattern.regex)) {
        const start = match.index || 0
        entities.push({
          text: match[0],
          type: pattern.type,
          confidence: 0.8 + Math.random() * 0.2,
          startPosition: start,
          endPosition: start + match[0].length
        })
      }
    }

    return entities.slice(0, 10)
  }

  /** Mock Google entity extraction: simulated latency, then the basic extractor. */
  private async extractEntitiesWithGoogle(content: string): Promise<EntityResult[]> {
    await this.delay(Math.random() * 500 + 250)
    return this.extractEntitiesBasic(content)
  }

  /** Basic entity extraction: organizations whose name ends in 公司 or 集团. */
  private extractEntitiesBasic(content: string): EntityResult[] {
    const entities: EntityResult[] = []

    for (const match of content.matchAll(/[\u4e00-\u9fa5]{2,4}(公司|集团)/g)) {
      const start = match.index || 0
      entities.push({
        text: match[0],
        type: 'organization',
        confidence: 0.7,
        startPosition: start,
        endPosition: start + match[0].length
      })
    }

    return entities
  }

  /** Mock OpenAI topic extraction: keyword matching against the first three NEWS_CATEGORIES entries. */
  private async extractTopicsWithOpenAI(content: string): Promise<TopicResult[]> {
    await this.delay(Math.random() * 400 + 200)

    const haystack = content.toLowerCase()
    const topics: TopicResult[] = []

    for (const category of this.NEWS_CATEGORIES.slice(0, 3)) {
      const matches = category.keywords.filter(keyword => haystack.includes(keyword.toLowerCase()))

      if (matches.length > 0) {
        topics.push({
          name: category.name,
          confidence: matches.length / category.keywords.length,
          keywords: matches
        })
      }
    }

    return topics.sort((a, b) => b.confidence - a.confidence)
  }

  /** Basic topic extraction: only checks for technology-related keywords. */
  private extractTopicsBasic(content: string): TopicResult[] {
    const topics: TopicResult[] = []

    const techKeywords = ['科技', '技术', '互联网', 'AI', '人工智能']
    const techMatches = techKeywords.filter(keyword => content.includes(keyword))

    if (techMatches.length > 0) {
      topics.push({
        name: '科技',
        confidence: techMatches.length / techKeywords.length,
        keywords: techMatches
      })
    }

    return topics
  }
}