Initial commit of akmon project

This commit is contained in:
2026-01-20 08:04:15 +08:00
commit 77a2bab985
1309 changed files with 343305 additions and 0 deletions

View File

@@ -0,0 +1,806 @@
// AI Content Analysis Service - Content classification, sentiment analysis, and quality assessment
import {
ContentAnalysisResult,
EntityResult,
TopicResult,
CategoryResult,
AIProvider,
AIResponse,
AIServiceConfig,
BatchProcessingOptions,
AIServiceError,
ContentInfo
} from '../types/ai-types.uts'
/** Names of the individual analyses a caller can request. */
type AnalysisType =
  | 'sentiment'
  | 'entities'
  | 'topics'
  | 'categories'
  | 'readability'
  | 'credibility'
  | 'toxicity'
  | 'summary'
  | 'keywords'
/** Options accepted by the content analysis entry points. */
type AnalysisOptions = {
  /** Analyses to run; the gated analysers return an empty/neutral value when their type is absent. */
  types: AnalysisType[]
  /** Provider to use; when omitted the service selects one from its configuration. */
  provider?: AIProvider
  /** Model identifier (not read by the analysis code in this file). */
  model?: string
  /** Score-inclusion flag (not read by the analysis code in this file). */
  includeScores?: boolean
  /** Detail-level flag (not read by the analysis code in this file). */
  detailedResults?: boolean
  /** Language code of the content; when omitted the language is detected from the text. */
  language?: string
  /** Caller-supplied categories (not read by the analysis code in this file). */
  customCategories?: string[]
}
/** Result of a content quality assessment; every field is a numeric score. */
type QualityAssessment = {
  /** Arithmetic mean of the seven dimension scores below. */
  overallScore: number
  /** Score for verifiable facts (dates, figures, places, names) in the text. */
  factualAccuracy: number
  /** Score derived from the source URL. */
  sourceReliability: number
  /** Score for sentence/paragraph structure and vocabulary diversity. */
  writingQuality: number
  /** Score reduced by subjective and emotional wording. */
  objectivity: number
  /** Score for content length and coverage of the who/what/when/where/why elements. */
  completeness: number
  /** Score that decreases with the age of the content. */
  timeliness: number
  /** Score for how well the text matches its declared category. */
  relevance: number
}
/** A single extracted keyword together with its frequency, importance and kind. */
type KeywordResult = {
  /** The keyword text. */
  keyword: string
  /** Occurrence count of the keyword. */
  frequency: number
  /** Importance weight of the keyword. */
  importance: number
  /** Kind of keyword. */
  type: 'noun' | 'verb' | 'adjective' | 'entity' | 'concept'
}
/** Running usage statistics maintained by the analysis service. */
type AnalysisStats = {
  /** Number of analyses started. */
  totalAnalyses: number
  /** Number of analyses that completed successfully. */
  successCount: number
  /** Number of analyses that failed. */
  errorCount: number
  /** Running average of the processing time, in milliseconds. */
  avgProcessingTimeMs: number
  /** Accumulated cost. */
  totalCost: number
  /** Analysis count per provider. */
  byProvider: Record<AIProvider, number>
  /** Request count per analysis type. */
  byAnalysisType: Record<AnalysisType, number>
}
/**
 * AI content analysis service.
 *
 * Provides sentiment analysis, entity recognition, topic extraction, content
 * classification and quality assessment. The provider-specific methods in this
 * class are local mocks (keyword/regex heuristics, random scores and simulated
 * latency); this class itself performs no network requests.
 */
export class AIContentAnalysisService {
  private config: AIServiceConfig

  private stats: AnalysisStats = {
    totalAnalyses: 0,
    successCount: 0,
    errorCount: 0,
    avgProcessingTimeMs: 0,
    totalCost: 0,
    byProvider: {} as Record<AIProvider, number>,
    byAnalysisType: {} as Record<AnalysisType, number>
  }

  /** Predefined news categories and the keywords used for keyword-based matching. */
  private readonly NEWS_CATEGORIES = [
    { id: 'politics', name: '政治', keywords: ['政府', '政策', '选举', '法律', '议会', '总统', '部长'] },
    { id: 'economy', name: '经济', keywords: ['经济', '金融', '股市', '投资', '银行', '贸易', 'GDP'] },
    { id: 'technology', name: '科技', keywords: ['科技', '人工智能', '互联网', '软件', '硬件', '创新', '数字化'] },
    { id: 'sports', name: '体育', keywords: ['体育', '足球', '篮球', '奥运', '比赛', '运动员', '锦标赛'] },
    { id: 'entertainment', name: '娱乐', keywords: ['娱乐', '电影', '音乐', '明星', '综艺', '演出', '艺术'] },
    { id: 'health', name: '健康', keywords: ['健康', '医疗', '病毒', '疫苗', '医院', '药物', '疾病'] },
    { id: 'education', name: '教育', keywords: ['教育', '学校', '大学', '学生', '教师', '考试', '学习'] },
    { id: 'environment', name: '环境', keywords: ['环境', '气候', '污染', '环保', '生态', '绿色', '可持续'] },
    { id: 'international', name: '国际', keywords: ['国际', '外交', '战争', '和平', '联合国', '条约', '全球'] },
    { id: 'social', name: '社会', keywords: ['社会', '社区', '公益', '慈善', '志愿者', '文化', '传统'] }
  ]

  /**
   * @param config Service configuration; the per-provider `apiKey` entries are
   *   consulted when no provider is requested explicitly.
   */
  constructor(config: AIServiceConfig) {
    this.config = config
    this.initializeStats()
  }

  /**
   * Analyses a piece of content.
   *
   * Sentiment, entities, topics, categories, summary and keywords are only
   * computed when listed in `options.types`; readability, credibility and
   * toxicity are always computed. An individual analyser that rejects falls
   * back to a neutral default instead of failing the whole analysis.
   *
   * @param content Text to analyse.
   * @param options Analysis options.
   * @returns A success response carrying the analysis result, or a failure
   *   response with `error` / `errorCode`. This method does not reject.
   */
  async analyzeContent(
    content: string,
    options: AnalysisOptions = {
      types: ['sentiment', 'entities', 'topics', 'categories', 'readability', 'summary', 'keywords']
    }
  ): Promise<AIResponse<ContentAnalysisResult>> {
    try {
      this.stats.totalAnalyses++
      const startTime = Date.now()

      // Pick the provider.
      const provider = options.provider || this.selectBestProvider()

      // Run every analyser; a tuple destructure keeps each result tied to its name.
      const [
        sentiment,
        entities,
        topics,
        categories,
        readability,
        credibility,
        toxicity,
        summary,
        keywords
      ] = await Promise.allSettled([
        this.analyzeSentiment(content, provider, options),
        this.extractEntities(content, provider, options),
        this.extractTopics(content, provider, options),
        this.classifyContent(content, options),
        this.assessReadability(content, options.language),
        this.assessCredibility(content),
        this.assessToxicity(content, provider),
        this.generateSummary(content, provider, options),
        this.extractKeywords(content, options)
      ])

      const processingTime = Date.now() - startTime
      const sentimentScore = this.extractResult(sentiment, 0)

      // Merge the partial results.
      const analysisResult: ContentAnalysisResult = {
        contentId: this.generateContentId(content),
        sentimentScore,
        sentimentLabel: this.getSentimentLabel(sentimentScore),
        readabilityScore: this.extractResult(readability, 0.5),
        credibilityScore: this.extractResult(credibility, 0.5),
        toxicityScore: this.extractResult(toxicity, 0),
        keywords: this.extractResult(keywords, []),
        entities: this.extractResult(entities, []),
        topics: this.extractResult(topics, []),
        categories: this.extractResult(categories, []),
        summary: this.extractResult(summary, ''),
        keyPhrases: this.extractKeyPhrases(content),
        language: options.language || await this.detectLanguage(content),
        processingTimeMs: processingTime,
        provider
      }

      // Count the success first: updateStats averages over successCount.
      this.stats.successCount++
      this.updateStats(provider, options.types, processingTime)

      return {
        success: true,
        data: analysisResult,
        processingTimeMs: processingTime,
        provider
      }
    } catch (error) {
      this.stats.errorCount++
      const aiError: AIServiceError = {
        code: 'ANALYSIS_ERROR',
        message: this.describeError(error, 'Content analysis failed'),
        provider: options.provider,
        retryable: this.isRetryableError(error)
      }
      return {
        success: false,
        error: aiError.message,
        errorCode: aiError.code
      }
    }
  }

  /**
   * Analyses several contents in batches.
   *
   * All items of one batch are analysed concurrently; batches run one after
   * another, separated by `delayMs`. Items whose analysis fails are reported
   * through `onError` and omitted from the result, so the returned array may
   * be shorter than `contents`. `onProgress` receives the number of successful
   * results so far and the total input count. Of `batchOptions`, only
   * `batchSize`, `delayMs`, `onError` and `onProgress` are read here.
   *
   * @param contents Texts to analyse.
   * @param options Analysis options applied to every item.
   * @param batchOptions Batch processing options.
   */
  async analyzeContentBatch(
    contents: string[],
    options: AnalysisOptions = { types: ['sentiment', 'categories', 'summary'] },
    batchOptions: BatchProcessingOptions = {
      batchSize: 5,
      concurrency: 2,
      retryCount: 2,
      delayMs: 1000
    }
  ): Promise<AIResponse<ContentAnalysisResult[]>> {
    try {
      const results: ContentAnalysisResult[] = []
      const batches = this.createBatches(contents, batchOptions.batchSize)

      for (const [index, batch] of batches.entries()) {
        const batchPromises = batch.map(async (content) => {
          try {
            const response = await this.analyzeContent(content, options)
            if (response.success && response.data) {
              return response.data
            }
            throw new Error(response.error || 'Analysis failed')
          } catch (error) {
            if (batchOptions.onError) {
              batchOptions.onError(error, content)
            }
            throw error
          }
        })

        const batchResults = await Promise.allSettled(batchPromises)
        for (const result of batchResults) {
          if (result.status === 'fulfilled') {
            results.push(result.value)
          }
        }

        // Progress callback.
        if (batchOptions.onProgress) {
          batchOptions.onProgress(results.length, contents.length)
        }

        // Pause between batches (not after the last one).
        if (index < batches.length - 1 && batchOptions.delayMs > 0) {
          await this.delay(batchOptions.delayMs)
        }
      }

      return { success: true, data: results }
    } catch (error) {
      return {
        success: false,
        error: this.describeError(error, 'Batch analysis failed')
      }
    }
  }

  /**
   * Assesses content quality along seven dimensions and averages them.
   *
   * @param content Text to assess.
   * @param metadata Optional content metadata; `sourceUrl`, `publishedAt` and
   *   `categoryId` are read.
   */
  async assessQuality(content: string, metadata?: Partial<ContentInfo>): Promise<AIResponse<QualityAssessment>> {
    try {
      const [
        factualScore,
        sourceScore,
        writingScore,
        objectivityScore,
        completenessScore,
        timelinessScore,
        relevanceScore
      ] = await Promise.all([
        this.assessFactualAccuracy(content),
        this.assessSourceReliability(metadata?.sourceUrl || ''),
        this.assessWritingQuality(content),
        this.assessObjectivity(content),
        this.assessCompleteness(content),
        this.assessTimeliness(metadata?.publishedAt || Date.now()),
        this.assessRelevance(content, metadata?.categoryId)
      ])

      const overallScore = (
        factualScore + sourceScore + writingScore + objectivityScore +
        completenessScore + timelinessScore + relevanceScore
      ) / 7

      const assessment: QualityAssessment = {
        overallScore,
        factualAccuracy: factualScore,
        sourceReliability: sourceScore,
        writingQuality: writingScore,
        objectivity: objectivityScore,
        completeness: completenessScore,
        timeliness: timelinessScore,
        relevance: relevanceScore
      }
      return { success: true, data: assessment }
    } catch (error) {
      return {
        success: false,
        error: this.describeError(error, 'Quality assessment failed')
      }
    }
  }

  /**
   * Returns a snapshot of the usage statistics.
   * The nested per-provider and per-type records are copied as well, so
   * mutating the snapshot cannot corrupt the service's own counters.
   */
  getStatistics(): AnalysisStats {
    return {
      ...this.stats,
      byProvider: { ...this.stats.byProvider },
      byAnalysisType: { ...this.stats.byAnalysisType }
    }
  }

  // ---------------------------------------------------------------------------
  // Private helpers
  // ---------------------------------------------------------------------------

  /**
   * Extracts a human-readable message from a caught value without assuming it
   * is an Error (a `throw null` or `throw 'text'` must not crash the handler).
   * Typed `any` to match the existing `isRetryableError` signature.
   */
  private describeError(error: any, fallback: string): string {
    if (error != null && typeof error.message === 'string' && error.message.length > 0) {
      return error.message
    }
    if (typeof error === 'string' && error.length > 0) return error
    return fallback
  }

  /**
   * Splits text into non-blank sentences on Latin (. ! ?) and CJK (。！？)
   * terminators. The pieces are not trimmed.
   */
  private splitSentences(content: string): string[] {
    return content.split(/[.!?。！？]+/).filter(sentence => sentence.trim().length > 0)
  }

  /** Returns the keywords that occur (case-insensitively) in already-lowercased text. */
  private matchKeywords(loweredContent: string, keywords: string[]): string[] {
    return keywords.filter(keyword => loweredContent.includes(keyword.toLowerCase()))
  }

  /**
   * Extracts the lowercased host name from a URL-like string by stripping the
   * scheme, path/query/fragment, user info, port and a trailing dot.
   */
  private extractHostname(sourceUrl: string): string {
    const withoutScheme = sourceUrl.trim().toLowerCase().replace(/^([a-z][a-z0-9+.-]*:)?\/\//, '')
    const authority = withoutScheme.split(/[\/?#]/)[0] ?? ''
    const hostAndPort = authority.substring(authority.lastIndexOf('@') + 1)
    return hostAndPort.replace(/:\d*$/, '').replace(/\.$/, '')
  }

  /** Sentiment score in [-1, 1]; returns 0 when sentiment was not requested. */
  private async analyzeSentiment(content: string, provider: AIProvider, options: AnalysisOptions): Promise<number> {
    if (!options.types.includes('sentiment')) return 0
    switch (provider) {
      case 'openai':
        return await this.analyzeSentimentWithOpenAI(content)
      case 'google':
        return await this.analyzeSentimentWithGoogle(content)
      case 'baidu':
        return await this.analyzeSentimentWithBaidu(content)
      default:
        return this.analyzeSentimentBasic(content)
    }
  }

  /** Named entities; returns an empty list when entities were not requested. */
  private async extractEntities(content: string, provider: AIProvider, options: AnalysisOptions): Promise<EntityResult[]> {
    if (!options.types.includes('entities')) return []
    switch (provider) {
      case 'openai':
        return await this.extractEntitiesWithOpenAI(content)
      case 'google':
        return await this.extractEntitiesWithGoogle(content)
      default:
        return this.extractEntitiesBasic(content)
    }
  }

  /** Topics; returns an empty list when topics were not requested. */
  private async extractTopics(content: string, provider: AIProvider, options: AnalysisOptions): Promise<TopicResult[]> {
    if (!options.types.includes('topics')) return []
    switch (provider) {
      case 'openai':
        return await this.extractTopicsWithOpenAI(content)
      default:
        return this.extractTopicsBasic(content)
    }
  }

  /**
   * Keyword-based classification against NEWS_CATEGORIES.
   * Confidence is the fraction of a category's keywords found in the text;
   * at most the three most confident categories are returned.
   */
  private async classifyContent(content: string, options: AnalysisOptions): Promise<CategoryResult[]> {
    if (!options.types.includes('categories')) return []

    const lowered = content.toLowerCase() // hoisted: lowercase once, not per keyword
    const categories: CategoryResult[] = []
    for (const category of this.NEWS_CATEGORIES) {
      const matches = this.matchKeywords(lowered, category.keywords)
      if (matches.length > 0) {
        categories.push({
          categoryId: category.id,
          categoryName: category.name,
          confidence: Math.min(matches.length / category.keywords.length, 1),
          level: 1
        })
      }
    }

    // Most confident first.
    return categories.sort((a, b) => b.confidence - a.confidence).slice(0, 3)
  }

  /**
   * Simplified readability score in [0, 1] based on average sentence length
   * and average word length. Words are whitespace-delimited, so text written
   * without spaces counts as very few, very long words. `language` is
   * currently not consulted.
   */
  private assessReadability(content: string, language?: string): number {
    const sentences = this.splitSentences(content)
    const words = content.split(/\s+/).filter(w => w.length > 0)
    const characters = content.replace(/\s/g, '').length
    if (sentences.length === 0 || words.length === 0) return 0

    const avgWordsPerSentence = words.length / sentences.length
    const avgCharsPerWord = characters / words.length

    let score = 1.0
    // Sentence-length penalties (cumulative).
    if (avgWordsPerSentence > 20) score -= 0.2
    if (avgWordsPerSentence > 30) score -= 0.3
    // Word-complexity penalties (cumulative).
    if (avgCharsPerWord > 6) score -= 0.1
    if (avgCharsPerWord > 8) score -= 0.2
    return Math.max(0, Math.min(1, score))
  }

  /** Heuristic credibility score in [0, 1], starting from a 0.5 baseline. */
  private assessCredibility(content: string): number {
    let score = 0.5 // baseline

    // Bonus for citing a source.
    if (content.includes('据') || content.includes('根据') || content.includes('来源')) {
      score += 0.2
    }
    // Bonus for concrete figures.
    if (/\d+%|\d+万|\d+亿|\d{4}年/.test(content)) {
      score += 0.15
    }
    // Penalty per distinct absolute term present.
    const extremeWords = ['绝对', '必然', '完全', '永远', '从来', '所有']
    const extremeCount = extremeWords.filter(word => content.includes(word)).length
    score -= extremeCount * 0.05
    // Penalty per distinct emotional term present.
    const emotionalWords = ['震惊', '愤怒', '可怕', '惊人', '令人发指']
    const emotionalCount = emotionalWords.filter(word => content.includes(word)).length
    score -= emotionalCount * 0.03

    return Math.max(0, Math.min(1, score))
  }

  /**
   * Basic toxicity score: number of distinct toxic terms present, divided by
   * ten. `provider` is currently not consulted.
   */
  private async assessToxicity(content: string, provider: AIProvider): Promise<number> {
    const toxicWords = ['仇恨', '歧视', '暴力', '威胁', '诽谤', '侮辱']
    const toxicCount = toxicWords.filter(word => content.includes(word)).length
    return Math.min(toxicCount / 10, 1)
  }

  /**
   * Naive summary: the first two sentences longer than ten characters, joined
   * with '。'. A trailing '。' is appended only when further sentences were
   * left out. Returns '' when a summary was not requested.
   */
  private async generateSummary(content: string, provider: AIProvider, options: AnalysisOptions): Promise<string> {
    if (!options.types.includes('summary')) return ''
    const sentences = this.splitSentences(content)
      .map(sentence => sentence.trim())
      .filter(sentence => sentence.length > 10)
    return sentences.slice(0, 2).join('。') + (sentences.length > 2 ? '。' : '')
  }

  /**
   * Naive keyword extraction: the ten most frequent whitespace-delimited
   * tokens (lowercased, longer than one character). Returns an empty list
   * when keywords were not requested.
   */
  private extractKeywords(content: string, options: AnalysisOptions): string[] {
    if (!options.types.includes('keywords')) return []

    const words = content
      .replace(/[^\u4e00-\u9fa5\w\s]/g, '') // keep CJK, word characters and whitespace
      .split(/\s+/)
      .filter(word => word.length > 1)

    // A Map (not a plain object) so tokens such as "constructor" or
    // "__proto__" cannot collide with inherited object members.
    const counts = new Map<string, number>()
    for (const word of words) {
      const key = word.toLowerCase()
      counts.set(key, (counts.get(key) ?? 0) + 1)
    }

    // Most frequent first; ties keep first-seen order.
    return Array.from(counts.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10)
      .map(([word]) => word)
  }

  /** Returns up to five distinct two- and three-word phrases, in order of appearance. */
  private extractKeyPhrases(content: string): string[] {
    const phrases: string[] = []
    // Drop the empty tokens produced by leading/trailing whitespace.
    const words = content.split(/\s+/).filter(word => word.length > 0)

    for (let i = 0; i < words.length - 1; i++) {
      const twoWordPhrase = words.slice(i, i + 2).join(' ')
      if (twoWordPhrase.length > 4) {
        phrases.push(twoWordPhrase)
      }
      if (i < words.length - 2) {
        const threeWordPhrase = words.slice(i, i + 3).join(' ')
        if (threeWordPhrase.length > 6) {
          phrases.push(threeWordPhrase)
        }
      }
    }

    // De-duplicate and keep the first five.
    return [...new Set(phrases)].slice(0, 5)
  }

  /**
   * Basic language detection: compares the number of CJK ideographs with the
   * number of Latin letters. Returns 'zh-CN', 'en', or 'auto' on a tie.
   */
  private async detectLanguage(content: string): Promise<string> {
    // The global flag is required: without it match() reports one hit at most.
    const chineseCount = (content.match(/[\u4e00-\u9fff]/g) || []).length
    const englishCount = (content.match(/[a-zA-Z]/g) || []).length
    if (chineseCount > englishCount) return 'zh-CN'
    if (englishCount > chineseCount) return 'en'
    return 'auto'
  }

  /** Maps a sentiment score to a label using a ±0.1 neutral band. */
  private getSentimentLabel(score: number): 'positive' | 'negative' | 'neutral' {
    if (score > 0.1) return 'positive'
    if (score < -0.1) return 'negative'
    return 'neutral'
  }

  /** Builds a simple content id from the current time and the first ten characters of the text. */
  private generateContentId(content: string): string {
    return `content_${Date.now()}_${content.substring(0, 10).replace(/\s/g, '_')}`
  }

  /** Unwraps a settled promise result, substituting `defaultValue` on rejection. */
  private extractResult<T>(result: PromiseSettledResult<T>, defaultValue: T): T {
    return result.status === 'fulfilled' ? result.value : defaultValue
  }

  /** Picks the first provider with a configured API key; falls back to 'openai'. */
  private selectBestProvider(): AIProvider {
    if (this.config.openai?.apiKey) return 'openai'
    if (this.config.google?.apiKey) return 'google'
    if (this.config.baidu?.apiKey) return 'baidu'
    return 'openai'
  }

  /**
   * Splits `items` into consecutive chunks of `batchSize`.
   * A missing, non-finite or non-positive size is treated as 1 so the loop
   * always advances.
   */
  private createBatches<T>(items: T[], batchSize: number): T[][] {
    const size = Number.isFinite(batchSize) ? Math.max(1, Math.floor(batchSize)) : 1
    const batches: T[][] = []
    for (let i = 0; i < items.length; i += size) {
      batches.push(items.slice(i, i + size))
    }
    return batches
  }

  /** Resolves after `ms` milliseconds. */
  private async delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }

  /** Seeds the per-provider and per-type counters with zero. */
  private initializeStats(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu', 'custom']
    const analysisTypes: AnalysisType[] = ['sentiment', 'entities', 'topics', 'categories', 'readability', 'credibility', 'toxicity', 'summary', 'keywords']
    providers.forEach(provider => {
      this.stats.byProvider[provider] = 0
    })
    analysisTypes.forEach(type => {
      this.stats.byAnalysisType[type] = 0
    })
  }

  /**
   * Records one successful analysis.
   *
   * The caller must increment `successCount` before calling: the running
   * average is taken over successful analyses only, so failed or still
   * in-flight analyses (counted in `totalAnalyses`) do not dilute it.
   */
  private updateStats(provider: AIProvider, types: AnalysisType[], processingTime: number): void {
    // `?? 0` keeps counters numeric for providers/types that were not pre-seeded.
    this.stats.byProvider[provider] = (this.stats.byProvider[provider] ?? 0) + 1
    for (const type of types) {
      this.stats.byAnalysisType[type] = (this.stats.byAnalysisType[type] ?? 0) + 1
    }
    const samples = Math.max(1, this.stats.successCount)
    this.stats.avgProcessingTimeMs += (processingTime - this.stats.avgProcessingTimeMs) / samples
  }

  /** True for a retryable `code` or a `status` of 500 and above; false for null/undefined. */
  private isRetryableError(error: any): boolean {
    if (error == null) return false
    const retryableCodes = ['TIMEOUT', 'RATE_LIMIT', 'SERVER_ERROR']
    return retryableCodes.includes(error.code) || Number(error.status) >= 500
  }

  // ---------------------------------------------------------------------------
  // Quality assessment methods
  // ---------------------------------------------------------------------------

  /** Rewards the presence of verifiable facts; baseline 0.5, +0.1 per kind found. */
  private async assessFactualAccuracy(content: string): Promise<number> {
    let score = 0.5
    // Dates.
    if (/\d{4}年|\d{1,2}月|\d{1,2}日/.test(content)) score += 0.1
    // Concrete figures.
    if (/\d+\.?\d*%|\d+万|\d+亿|\d+千/.test(content)) score += 0.1
    // Places.
    if (/市|省|县|区|国|州/.test(content)) score += 0.1
    // People or organisations.
    if (/先生|女士|部长|主席|公司|集团|大学|医院/.test(content)) score += 0.1
    return Math.min(1, score)
  }

  /**
   * Scores the reliability of a source by its host name.
   *
   * Only the host is inspected, and trusted domains must match the host
   * itself or a parent domain, so a trusted name appearing in the path, the
   * query string or inside an unrelated host name earns no credit.
   */
  private async assessSourceReliability(sourceUrl: string): Promise<number> {
    if (!sourceUrl) return 0.3

    const reliableDomains = ['gov.cn', 'edu.cn', 'xinhuanet.com', 'people.com.cn', 'cctv.com']
    const host = this.extractHostname(sourceUrl)

    const isTrusted = reliableDomains.some(domain => host === domain || host.endsWith('.' + domain))
    if (isTrusted) return 0.9
    // A whole "gov" or "edu" label anywhere in the host name.
    if (/(^|\.)(gov|edu)(\.|$)/.test(host)) return 0.8
    if (host.includes('news') || host.includes('media')) return 0.6
    return 0.4
  }

  /** Scores structure (sentences, paragraphs) and vocabulary diversity; baseline 0.5. */
  private async assessWritingQuality(content: string): Promise<number> {
    let score = 0.5

    // Sentence structure.
    if (this.splitSentences(content).length > 2) score += 0.1

    // Paragraph structure.
    const paragraphs = content.split('\n\n').filter(p => p.trim().length > 0)
    if (paragraphs.length > 1) score += 0.1

    // Vocabulary diversity; guarded so empty content does not yield 0/0 = NaN.
    const words = content.split(/\s+/).filter(w => w.length > 0)
    const uniqueWords = new Set(words.map(w => w.toLowerCase()))
    const diversity = words.length > 0 ? uniqueWords.size / words.length : 0
    score += diversity * 0.3

    return Math.min(1, score)
  }

  /** Scores objectivity: baseline 0.7 minus penalties for subjective and emotional wording. */
  private async assessObjectivity(content: string): Promise<number> {
    let score = 0.7 // baseline

    // Subjective-wording penalty.
    const subjectiveWords = ['我认为', '个人觉得', '显然', '明显', '无疑', '肯定']
    const subjectiveCount = subjectiveWords.filter(word => content.includes(word)).length
    score -= subjectiveCount * 0.1

    // Emotional-wording penalty.
    const emotionalWords = ['愤怒', '激动', '兴奋', '失望', '震惊', '惊喜']
    const emotionalCount = emotionalWords.filter(word => content.includes(word)).length
    score -= emotionalCount * 0.05

    return Math.max(0, Math.min(1, score))
  }

  /** Scores completeness from content length and coverage of the who/what/when/where/why elements. */
  private async assessCompleteness(content: string): Promise<number> {
    let score = 0.3

    // Length bonuses (cumulative).
    if (content.length > 200) score += 0.2
    if (content.length > 500) score += 0.2
    if (content.length > 1000) score += 0.2

    // Key news elements.
    const hasWho = /人|者|员|家|国|公司|组织/.test(content)
    const hasWhat = /事件|活动|发生|进行|宣布|决定/.test(content)
    const hasWhen = /\d{4}年|\d{1,2}月|\d{1,2}日|今天|昨天|明天/.test(content)
    const hasWhere = /市|省|县|区|国|地区|地点/.test(content)
    const hasWhy = /因为|由于|原因|目的|为了/.test(content)
    const elements = [hasWho, hasWhat, hasWhen, hasWhere, hasWhy].filter(Boolean).length
    score += elements * 0.06

    return Math.min(1, score)
  }

  /**
   * Scores timeliness from the age of the content: the newer, the higher.
   *
   * @param publishedAt Publication time as a millisecond timestamp comparable
   *   with `Date.now()`.
   */
  private async assessTimeliness(publishedAt: number): Promise<number> {
    const now = Date.now()
    const ageHours = (now - publishedAt) / (1000 * 60 * 60)
    if (ageHours < 1) return 1.0
    if (ageHours < 6) return 0.9
    if (ageHours < 24) return 0.7
    if (ageHours < 72) return 0.5
    if (ageHours < 168) return 0.3
    return 0.1
  }

  /** Scores how well the text matches its declared category; 0.5 when the category is absent or unknown. */
  private async assessRelevance(content: string, categoryId?: string): Promise<number> {
    if (!categoryId) return 0.5
    const category = this.NEWS_CATEGORIES.find(c => c.id === categoryId)
    if (!category) return 0.5
    const matches = this.matchKeywords(content.toLowerCase(), category.keywords).length
    return Math.min(1, matches / category.keywords.length + 0.3)
  }

  // ---------------------------------------------------------------------------
  // Mock AI service methods
  // ---------------------------------------------------------------------------

  /** Mock OpenAI sentiment: simulated latency plus a word-list polarity ratio in [-1, 1]. */
  private async analyzeSentimentWithOpenAI(content: string): Promise<number> {
    await this.delay(Math.random() * 500 + 200)
    const positiveWords = ['好', '棒', '优秀', '成功', '胜利', '喜悦', '高兴', '满意']
    const negativeWords = ['坏', '糟糕', '失败', '问题', '困难', '悲伤', '愤怒', '失望']
    const positiveCount = positiveWords.filter(word => content.includes(word)).length
    const negativeCount = negativeWords.filter(word => content.includes(word)).length
    const score = (positiveCount - negativeCount) / Math.max(positiveCount + negativeCount, 1)
    return Math.max(-1, Math.min(1, score))
  }

  /** Mock Google sentiment: simulated latency and a random score in [-1, 1). */
  private async analyzeSentimentWithGoogle(content: string): Promise<number> {
    await this.delay(Math.random() * 400 + 150)
    return Math.random() * 2 - 1
  }

  /** Mock Baidu sentiment: simulated latency and a random score in [-1, 1). */
  private async analyzeSentimentWithBaidu(content: string): Promise<number> {
    await this.delay(Math.random() * 300 + 100)
    return Math.random() * 2 - 1
  }

  /** Basic sentiment: case-insensitive word-list polarity ratio in [-1, 1]. */
  private analyzeSentimentBasic(content: string): number {
    const positiveWords = ['好', '棒', '优秀', '成功', '胜利', 'great', 'good', 'excellent']
    const negativeWords = ['坏', '糟糕', '失败', '问题', 'bad', 'terrible', 'awful']
    const lowered = content.toLowerCase() // hoisted: lowercase once, not per word
    const positiveCount = positiveWords.filter(word => lowered.includes(word)).length
    const negativeCount = negativeWords.filter(word => lowered.includes(word)).length
    return (positiveCount - negativeCount) / Math.max(positiveCount + negativeCount, 1)
  }

  /** Mock OpenAI entity extraction: simulated latency plus regex patterns; at most ten entities. */
  private async extractEntitiesWithOpenAI(content: string): Promise<EntityResult[]> {
    await this.delay(Math.random() * 600 + 300)

    const entities: EntityResult[] = []
    const patterns = [
      { regex: /[\u4e00-\u9fa5]{2,4}(公司|集团|企业|机构)/g, type: 'organization' as const },
      { regex: /[\u4e00-\u9fa5]{2,3}(市|省|县|区)/g, type: 'location' as const },
      { regex: /[\u4e00-\u9fa5]{2,4}(先生|女士|部长|主席|总裁|经理)/g, type: 'person' as const },
      { regex: /\d{4}年\d{1,2}月\d{1,2}日/g, type: 'date' as const },
      { regex: /\d+\.?\d*(万|亿|千)?(元|美元|英镑)/g, type: 'money' as const }
    ]

    for (const pattern of patterns) {
      for (const match of content.matchAll(pattern.regex)) {
        const start = match.index ?? 0
        entities.push({
          text: match[0],
          type: pattern.type,
          confidence: 0.8 + Math.random() * 0.2,
          startPosition: start,
          endPosition: start + match[0].length
        })
      }
    }

    return entities.slice(0, 10)
  }

  /** Mock Google entity extraction: simulated latency, then the basic extractor. */
  private async extractEntitiesWithGoogle(content: string): Promise<EntityResult[]> {
    await this.delay(Math.random() * 500 + 250)
    return this.extractEntitiesBasic(content)
  }

  /** Basic entity extraction: organisation names ending in 公司 or 集团. */
  private extractEntitiesBasic(content: string): EntityResult[] {
    const entities: EntityResult[] = []
    for (const match of content.matchAll(/[\u4e00-\u9fa5]{2,4}(公司|集团)/g)) {
      const start = match.index ?? 0
      entities.push({
        text: match[0],
        type: 'organization',
        confidence: 0.7,
        startPosition: start,
        endPosition: start + match[0].length
      })
    }
    return entities
  }

  /**
   * Mock OpenAI topic extraction: simulated latency plus keyword matching
   * against the first three NEWS_CATEGORIES entries, most confident first.
   */
  private async extractTopicsWithOpenAI(content: string): Promise<TopicResult[]> {
    await this.delay(Math.random() * 400 + 200)

    const lowered = content.toLowerCase()
    const topics: TopicResult[] = []
    for (const category of this.NEWS_CATEGORIES.slice(0, 3)) {
      const matches = this.matchKeywords(lowered, category.keywords)
      if (matches.length > 0) {
        topics.push({
          name: category.name,
          confidence: matches.length / category.keywords.length,
          keywords: matches
        })
      }
    }
    return topics.sort((a, b) => b.confidence - a.confidence)
  }

  /** Basic topic extraction: detects only the technology topic. */
  private extractTopicsBasic(content: string): TopicResult[] {
    const topics: TopicResult[] = []
    const techKeywords = ['科技', '技术', '互联网', 'AI', '人工智能']
    const techMatches = techKeywords.filter(keyword => content.includes(keyword))
    if (techMatches.length > 0) {
      topics.push({
        name: '科技',
        confidence: techMatches.length / techKeywords.length,
        keywords: techMatches
      })
    }
    return topics
  }
}