// AI Content Analysis Service - Content classification, sentiment analysis, and quality assessment
import {
  ContentAnalysisResult,
  EntityResult,
  TopicResult,
  CategoryResult,
  AIProvider,
  AIResponse,
  AIServiceConfig,
  BatchProcessingOptions,
  AIServiceError,
  ContentInfo
} from '../types/ai-types.uts'

// Kinds of analysis a caller can request from the service.
type AnalysisType =
  | 'sentiment'
  | 'entities'
  | 'topics'
  | 'categories'
  | 'readability'
  | 'credibility'
  | 'toxicity'
  | 'summary'
  | 'keywords'

// Options accepted by the content analysis entry points.
type AnalysisOptions = {
  // Analyses to run; analyzers that check this list return a neutral default when their type is absent.
  types: AnalysisType[]
  // Explicit provider; when omitted the service picks one from its configuration.
  provider?: AIProvider
  // NOTE(review): model, includeScores, detailedResults and customCategories are
  // declared here but not read by the service code in this file.
  model?: string
  includeScores?: boolean
  detailedResults?: boolean
  // Language tag for the content; when omitted the service detects it.
  language?: string
  customCategories?: string[]
}

// Result of a content quality assessment.
// The service fills every dimension with a score between 0 and 1 and sets
// overallScore to the arithmetic mean of the seven dimensions.
type QualityAssessment = {
  overallScore: number
  factualAccuracy: number
  sourceReliability: number
  writingQuality: number
  objectivity: number
  completeness: number
  timeliness: number
  relevance: number
}

// A single extracted keyword with its statistics.
// NOTE(review): not referenced by the service code in this file; the field
// meanings below are inferred from their names — confirm against consumers.
type KeywordResult = {
  keyword: string
  // presumably the number of occurrences in the analysed text
  frequency: number
  // presumably a relative weight; scale not established here
  importance: number
  type: 'noun' | 'verb' | 'adjective' | 'entity' | 'concept'
}

// Running counters describing the analyses a service instance has performed.
type AnalysisStats = {
  // Number of analyzeContent calls started (successful or not).
  totalAnalyses: number
  successCount: number
  errorCount: number
  // Mean wall-clock duration of analyses, in milliseconds.
  avgProcessingTimeMs: number
  // NOTE(review): initialised to 0 and never updated by the service code in this file.
  totalCost: number
  // Number of analyses per provider.
  byProvider: Record<AIProvider, number>
  // Number of times each analysis type was requested.
  byAnalysisType: Record<AnalysisType, number>
}

/**
 * AI content analysis service.
 *
 * Provides sentiment analysis, entity recognition, topic extraction, content
 * classification and quality assessment. The provider-specific methods in this
 * class are mocks: they simulate latency with a timer and fall back to keyword
 * heuristics or random values rather than calling a remote API.
 */
export class AIContentAnalysisService {
  private readonly config: AIServiceConfig

  private stats: AnalysisStats = {
    totalAnalyses: 0,
    successCount: 0,
    errorCount: 0,
    avgProcessingTimeMs: 0,
    totalCost: 0,
    byProvider: {} as Record<AIProvider, number>,
    byAnalysisType: {} as Record<AnalysisType, number>
  }

  // Predefined news categories with the keywords used for keyword-based matching.
  private readonly NEWS_CATEGORIES = [
    { id: 'politics', name: '政治', keywords: ['政府', '政策', '选举', '法律', '议会', '总统', '部长'] },
    { id: 'economy', name: '经济', keywords: ['经济', '金融', '股市', '投资', '银行', '贸易', 'GDP'] },
    { id: 'technology', name: '科技', keywords: ['科技', '人工智能', '互联网', '软件', '硬件', '创新', '数字化'] },
    { id: 'sports', name: '体育', keywords: ['体育', '足球', '篮球', '奥运', '比赛', '运动员', '锦标赛'] },
    { id: 'entertainment', name: '娱乐', keywords: ['娱乐', '电影', '音乐', '明星', '综艺', '演出', '艺术'] },
    { id: 'health', name: '健康', keywords: ['健康', '医疗', '病毒', '疫苗', '医院', '药物', '疾病'] },
    { id: 'education', name: '教育', keywords: ['教育', '学校', '大学', '学生', '教师', '考试', '学习'] },
    { id: 'environment', name: '环境', keywords: ['环境', '气候', '污染', '环保', '生态', '绿色', '可持续'] },
    { id: 'international', name: '国际', keywords: ['国际', '外交', '战争', '和平', '联合国', '条约', '全球'] },
    { id: 'social', name: '社会', keywords: ['社会', '社区', '公益', '慈善', '志愿者', '文化', '传统'] }
  ]

  constructor(config: AIServiceConfig) {
    this.config = config
    this.initializeStats()
  }

  /**
   * Analyses a piece of content.
   *
   * Sentiment, entities, topics, categories, summary and keywords are only
   * computed when listed in `options.types`; otherwise they come back as a
   * neutral default (0, empty array or empty string). Readability,
   * credibility, toxicity and key phrases are always computed. An analyzer
   * that rejects does not fail the call: its slot falls back to a default.
   *
   * @param content Text to analyse.
   * @param options Analysis options (requested types, provider override, language).
   * @returns `{ success: true, data, processingTimeMs, provider }` on success,
   *          `{ success: false, error, errorCode }` when the analysis throws.
   */
  async analyzeContent(
    content: string,
    options: AnalysisOptions = {
      types: ['sentiment', 'entities', 'topics', 'categories', 'readability', 'summary', 'keywords']
    }
  ): Promise<AIResponse<ContentAnalysisResult>> {
    this.stats.totalAnalyses++

    try {
      const startedAt = Date.now()

      // An explicit provider wins; otherwise pick one from the configuration.
      const provider = options.provider || this.selectBestProvider()

      // Run every analyzer concurrently and keep going when one of them rejects.
      const [
        sentiment,
        entities,
        topics,
        categories,
        readability,
        credibility,
        toxicity,
        summary,
        keywords
      ] = await Promise.allSettled([
        this.analyzeSentiment(content, provider, options),
        this.extractEntities(content, provider, options),
        this.extractTopics(content, provider, options),
        this.classifyContent(content, options),
        this.assessReadability(content, options.language),
        this.assessCredibility(content),
        this.assessToxicity(content, provider),
        this.generateSummary(content, provider, options),
        this.extractKeywords(content, options)
      ])

      const processingTime = Date.now() - startedAt
      const sentimentScore = this.extractResult(sentiment, 0)

      // Merge the individual results, substituting defaults for rejected analyzers.
      const analysisResult: ContentAnalysisResult = {
        contentId: this.generateContentId(content),
        sentimentScore,
        sentimentLabel: this.getSentimentLabel(sentimentScore),
        readabilityScore: this.extractResult(readability, 0.5),
        credibilityScore: this.extractResult(credibility, 0.5),
        toxicityScore: this.extractResult(toxicity, 0),
        keywords: this.extractResult(keywords, []),
        entities: this.extractResult(entities, []),
        topics: this.extractResult(topics, []),
        categories: this.extractResult(categories, []),
        summary: this.extractResult(summary, ''),
        keyPhrases: this.extractKeyPhrases(content),
        language: options.language || await this.detectLanguage(content),
        processingTimeMs: processingTime,
        provider
      }

      this.updateStats(provider, options.types, processingTime)

      return {
        success: true,
        data: analysisResult,
        processingTimeMs: processingTime,
        provider
      }
    } catch (error) {
      this.stats.errorCount++

      const aiError: AIServiceError = {
        code: 'ANALYSIS_ERROR',
        message: this.describeError(error, 'Content analysis failed'),
        provider: options.provider,
        retryable: this.isRetryableError(error)
      }

      return {
        success: false,
        error: aiError.message,
        errorCode: aiError.code
      }
    }
  }

  /**
   * Analyses many pieces of content in batches.
   *
   * Batches run one after another; all items inside a batch run concurrently.
   * Items whose analysis fails are reported through `batchOptions.onError`
   * (when provided) and left out of the returned array. After every batch
   * `batchOptions.onProgress` receives the number of successful results so far
   * and the total number of inputs. `batchOptions.concurrency` and
   * `batchOptions.retryCount` are not read by this implementation.
   *
   * @param contents Texts to analyse.
   * @param options Analysis options applied to every item.
   * @param batchOptions Batch size, inter-batch delay and callbacks.
   * @returns The successful results, or `{ success: false, error }` when batching itself throws.
   */
  async analyzeContentBatch(
    contents: string[],
    options: AnalysisOptions = { types: ['sentiment', 'categories', 'summary'] },
    batchOptions: BatchProcessingOptions = {
      batchSize: 5,
      concurrency: 2,
      retryCount: 2,
      delayMs: 1000
    }
  ): Promise<AIResponse<ContentAnalysisResult[]>> {
    try {
      const collected: ContentAnalysisResult[] = []
      const batches = this.createBatches(contents, batchOptions.batchSize)

      for (const [batchIndex, batch] of batches.entries()) {
        const pending = batch.map(async (content) => {
          try {
            const response = await this.analyzeContent(content, options)
            if (response.success && response.data) {
              return response.data
            }
            throw new Error(response.error || 'Analysis failed')
          } catch (error) {
            if (batchOptions.onError) {
              // Hand the callback a real Error even if something else was thrown.
              batchOptions.onError(error instanceof Error ? error : new Error(String(error)), content)
            }
            throw error
          }
        })

        // allSettled: one failing item must not discard the rest of the batch.
        const settled = await Promise.allSettled(pending)
        for (const outcome of settled) {
          if (outcome.status === 'fulfilled') {
            collected.push(outcome.value)
          }
        }

        // Progress callback.
        if (batchOptions.onProgress) {
          batchOptions.onProgress(collected.length, contents.length)
        }

        // Pause between batches, but not after the last one.
        const isLastBatch = batchIndex >= batches.length - 1
        if (!isLastBatch && batchOptions.delayMs > 0) {
          await this.delay(batchOptions.delayMs)
        }
      }

      return { success: true, data: collected }
    } catch (error) {
      return {
        success: false,
        error: this.describeError(error, 'Batch analysis failed')
      }
    }
  }

  /**
   * Scores content quality on seven dimensions and averages them.
   *
   * @param content Text to assess.
   * @param metadata Optional content metadata; `sourceUrl`, `publishedAt` and
   *                 `categoryId` are read when present.
   * @returns The assessment, or `{ success: false, error }` when scoring throws.
   */
  async assessQuality(content: string, metadata?: Partial<ContentInfo>): Promise<AIResponse<QualityAssessment>> {
    try {
      const [
        factualAccuracy,
        sourceReliability,
        writingQuality,
        objectivity,
        completeness,
        timeliness,
        relevance
      ] = await Promise.all([
        this.assessFactualAccuracy(content),
        this.assessSourceReliability(metadata?.sourceUrl || ''),
        this.assessWritingQuality(content),
        this.assessObjectivity(content),
        this.assessCompleteness(content),
        // Without a publication time the content is treated as published now.
        this.assessTimeliness(metadata?.publishedAt || Date.now()),
        this.assessRelevance(content, metadata?.categoryId)
      ])

      const overallScore = (
        factualAccuracy + sourceReliability + writingQuality + objectivity +
        completeness + timeliness + relevance
      ) / 7

      const assessment: QualityAssessment = {
        overallScore,
        factualAccuracy,
        sourceReliability,
        writingQuality,
        objectivity,
        completeness,
        timeliness,
        relevance
      }

      return { success: true, data: assessment }
    } catch (error) {
      return {
        success: false,
        error: this.describeError(error, 'Quality assessment failed')
      }
    }
  }

  /**
   * Returns a snapshot of the statistics.
   *
   * The per-provider and per-type records are copied too, so mutating the
   * returned object cannot corrupt the service's internal counters.
   */
  getStatistics(): AnalysisStats {
    return {
      ...this.stats,
      byProvider: { ...this.stats.byProvider },
      byAnalysisType: { ...this.stats.byAnalysisType }
    }
  }

  // Private methods

  /** Dispatches sentiment analysis to the provider-specific implementation; returns 0 when not requested. */
  private async analyzeSentiment(content: string, provider: AIProvider, options: AnalysisOptions): Promise<number> {
    if (!options.types.includes('sentiment')) return 0

    switch (provider) {
      case 'openai':
        return this.analyzeSentimentWithOpenAI(content)
      case 'google':
        return this.analyzeSentimentWithGoogle(content)
      case 'baidu':
        return this.analyzeSentimentWithBaidu(content)
      default:
        return this.analyzeSentimentBasic(content)
    }
  }

  /** Dispatches entity extraction to the provider-specific implementation; returns [] when not requested. */
  private async extractEntities(content: string, provider: AIProvider, options: AnalysisOptions): Promise<EntityResult[]> {
    if (!options.types.includes('entities')) return []

    switch (provider) {
      case 'openai':
        return this.extractEntitiesWithOpenAI(content)
      case 'google':
        return this.extractEntitiesWithGoogle(content)
      default:
        return this.extractEntitiesBasic(content)
    }
  }

  /** Dispatches topic extraction to the provider-specific implementation; returns [] when not requested. */
  private async extractTopics(content: string, provider: AIProvider, options: AnalysisOptions): Promise<TopicResult[]> {
    if (!options.types.includes('topics')) return []

    switch (provider) {
      case 'openai':
        return this.extractTopicsWithOpenAI(content)
      default:
        return this.extractTopicsBasic(content)
    }
  }

  /**
   * Keyword-based classification against NEWS_CATEGORIES.
   *
   * Confidence is the fraction of a category's keywords found in the content;
   * the three most confident categories are returned. `options.customCategories`
   * is not consulted here.
   */
  private async classifyContent(content: string, options: AnalysisOptions): Promise<CategoryResult[]> {
    if (!options.types.includes('categories')) return []

    const haystack = content.toLowerCase()
    const categories: CategoryResult[] = []

    for (const category of this.NEWS_CATEGORIES) {
      const hits = category.keywords.filter(keyword => haystack.includes(keyword.toLowerCase()))
      if (hits.length === 0) continue

      categories.push({
        categoryId: category.id,
        categoryName: category.name,
        confidence: Math.min(hits.length / category.keywords.length, 1),
        level: 1
      })
    }

    // Highest confidence first.
    return categories.sort((a, b) => b.confidence - a.confidence).slice(0, 3)
  }

  /**
   * Simplified readability score in [0, 1].
   *
   * Starts at 1 and subtracts penalties for long sentences and long words.
   * Words are whitespace-delimited, so unsegmented text counts as few long words.
   * `language` is accepted for interface compatibility but not used.
   */
  private assessReadability(content: string, language?: string): number {
    const sentences = this.splitSentences(content)
    const words = content.split(/\s+/).filter(w => w.length > 0)
    const characters = content.replace(/\s/g, '').length

    if (sentences.length === 0 || words.length === 0) return 0

    const avgWordsPerSentence = words.length / sentences.length
    const avgCharsPerWord = characters / words.length

    let score = 1.0

    // Sentence-length penalties (cumulative).
    if (avgWordsPerSentence > 20) score -= 0.2
    if (avgWordsPerSentence > 30) score -= 0.3

    // Word-complexity penalties (cumulative).
    if (avgCharsPerWord > 6) score -= 0.1
    if (avgCharsPerWord > 8) score -= 0.2

    return Math.max(0, Math.min(1, score))
  }

  /**
   * Heuristic credibility score in [0, 1]: rewards attribution phrases and
   * concrete figures, penalises absolute and emotional wording.
   */
  private assessCredibility(content: string): number {
    let score = 0.5 // baseline

    // Mentions a citation or source.
    if (content.includes('据') || content.includes('根据') || content.includes('来源')) {
      score += 0.2
    }

    // Contains concrete figures.
    if (/\d+%|\d+万|\d+亿|\d{4}年/.test(content)) {
      score += 0.15
    }

    // Absolute wording lowers the score.
    const extremeWords = ['绝对', '必然', '完全', '永远', '从来', '所有']
    const extremeCount = extremeWords.filter(word => content.includes(word)).length
    score -= extremeCount * 0.05

    // Emotional wording lowers the score.
    const emotionalWords = ['震惊', '愤怒', '可怕', '惊人', '令人发指']
    const emotionalCount = emotionalWords.filter(word => content.includes(word)).length
    score -= emotionalCount * 0.03

    return Math.max(0, Math.min(1, score))
  }

  /** Basic toxicity score: number of distinct toxic terms present, divided by 10. `provider` is unused. */
  private async assessToxicity(content: string, provider: AIProvider): Promise<number> {
    const toxicWords = ['仇恨', '歧视', '暴力', '威胁', '诽谤', '侮辱']
    const toxicCount = toxicWords.filter(word => content.includes(word)).length

    return Math.min(toxicCount / 10, 1)
  }

  /**
   * Naive summary: the first two sentences longer than 10 characters, joined
   * with '。'. A trailing '。' is appended only when more sentences follow.
   * `provider` is unused.
   */
  private async generateSummary(content: string, provider: AIProvider, options: AnalysisOptions): Promise<string> {
    if (!options.types.includes('summary')) return ''

    const sentences = this.splitSentences(content, 10)
    return sentences.slice(0, 2).join('。') + (sentences.length > 2 ? '。' : '')
  }

  /** Returns up to ten lower-cased, whitespace-delimited tokens ordered by descending frequency. */
  private extractKeywords(content: string, options: AnalysisOptions): string[] {
    if (!options.types.includes('keywords')) return []

    const tokens = content
      .replace(/[^\u4e00-\u9fa5\w\s]/g, '') // keep CJK characters, word characters and whitespace
      .split(/\s+/)
      .filter(word => word.length > 1)

    // Count occurrences case-insensitively.
    const frequency: Record<string, number> = {}
    for (const token of tokens) {
      const key = token.toLowerCase()
      frequency[key] = (frequency[key] || 0) + 1
    }

    return Object.entries(frequency)
      .sort(([, a], [, b]) => b - a)
      .slice(0, 10)
      .map(([word]) => word)
  }

  /** Collects the first five distinct two- and three-word phrases in reading order. */
  private extractKeyPhrases(content: string): string[] {
    const phrases: string[] = []
    // Drop the empty tokens that leading/trailing whitespace would otherwise
    // produce, so no phrase is built from an empty "word".
    const words = content.split(/\s+/).filter(w => w.length > 0)

    for (let i = 0; i < words.length - 1; i++) {
      const twoWordPhrase = words.slice(i, i + 2).join(' ')
      if (twoWordPhrase.length > 4) {
        phrases.push(twoWordPhrase)
      }

      if (i < words.length - 2) {
        const threeWordPhrase = words.slice(i, i + 3).join(' ')
        if (threeWordPhrase.length > 6) {
          phrases.push(threeWordPhrase)
        }
      }
    }

    // De-duplicate and keep the first five.
    return [...new Set(phrases)].slice(0, 5)
  }

  /**
   * Basic language detection by comparing how many CJK ideographs and how many
   * ASCII letters the content contains. Returns 'auto' on a tie.
   */
  private async detectLanguage(content: string): Promise<string> {
    // The global flag is required: without it match() reports a single hit and
    // the comparison below degenerates to "present vs. absent".
    const chineseCount = (content.match(/[\u4e00-\u9fff]/g) ?? []).length
    const englishCount = (content.match(/[a-zA-Z]/g) ?? []).length

    if (chineseCount > englishCount) return 'zh-CN'
    if (englishCount > chineseCount) return 'en'
    return 'auto'
  }

  /** Maps a sentiment score to a label using a ±0.1 neutral band. */
  private getSentimentLabel(score: number): 'positive' | 'negative' | 'neutral' {
    if (score > 0.1) return 'positive'
    if (score < -0.1) return 'negative'
    return 'neutral'
  }

  /**
   * Builds a simple content id from the current time and the first ten characters.
   * NOTE(review): contents sharing a ten-character prefix and analysed in the
   * same millisecond get the same id.
   */
  private generateContentId(content: string): string {
    return `content_${Date.now()}_${content.substring(0, 10).replace(/\s/g, '_')}`
  }

  /** Unwraps a settled promise, substituting `defaultValue` when it was rejected. */
  private extractResult<T>(result: PromiseSettledResult<T>, defaultValue: T): T {
    return result.status === 'fulfilled' ? result.value : defaultValue
  }

  /** Picks the first provider that has an API key configured; defaults to 'openai'. */
  private selectBestProvider(): AIProvider {
    if (this.config.openai?.apiKey) return 'openai'
    if (this.config.google?.apiKey) return 'google'
    if (this.config.baidu?.apiKey) return 'baidu'
    return 'openai'
  }

  /**
   * Splits `items` into consecutive chunks of at most `batchSize` elements.
   * A missing, non-numeric or non-positive size falls back to 1 so the loop
   * always advances; fractional sizes are rounded down.
   */
  private createBatches<T>(items: T[], batchSize: number): T[][] {
    const size = batchSize >= 1 ? Math.floor(batchSize) : 1
    const batches: T[][] = []
    for (let start = 0; start < items.length; start += size) {
      batches.push(items.slice(start, start + size))
    }
    return batches
  }

  /** Resolves after `ms` milliseconds. */
  private async delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }

  /** Seeds the per-provider and per-type counters with zero. */
  private initializeStats(): void {
    const providers: AIProvider[] = ['openai', 'google', 'baidu', 'custom']
    const analysisTypes: AnalysisType[] = ['sentiment', 'entities', 'topics', 'categories', 'readability', 'credibility', 'toxicity', 'summary', 'keywords']

    for (const provider of providers) {
      this.stats.byProvider[provider] = 0
    }
    for (const type of analysisTypes) {
      this.stats.byAnalysisType[type] = 0
    }
  }

  /**
   * Records one successful analysis: bumps the success counter, the provider
   * and per-type counters, and folds the duration into the running mean.
   * The mean is taken over successful analyses only, because failed ones
   * never contribute a duration.
   */
  private updateStats(provider: AIProvider, types: AnalysisType[], processingTime: number): void {
    this.stats.successCount++

    // `?? 0` keeps the counters numeric for keys that were never seeded.
    this.stats.byProvider[provider] = (this.stats.byProvider[provider] ?? 0) + 1
    for (const type of types) {
      this.stats.byAnalysisType[type] = (this.stats.byAnalysisType[type] ?? 0) + 1
    }

    const previousAvg = this.stats.avgProcessingTimeMs
    this.stats.avgProcessingTimeMs = previousAvg + (processingTime - previousAvg) / this.stats.successCount
  }

  /** Treats known transient error codes and numeric statuses of 500 or above as retryable. */
  private isRetryableError(error: unknown): boolean {
    if (typeof error !== 'object' || error === null) return false

    const { code, status } = error as { code?: unknown; status?: unknown }
    const retryableCodes = ['TIMEOUT', 'RATE_LIMIT', 'SERVER_ERROR']

    return (typeof code === 'string' && retryableCodes.includes(code)) ||
      (typeof status === 'number' && status >= 500)
  }

  /** Returns the thrown value's non-empty string `message`, or `fallback` when it has none. */
  private describeError(error: unknown, fallback: string): string {
    if (typeof error === 'object' && error !== null) {
      const message = (error as { message?: unknown }).message
      if (typeof message === 'string' && message) return message
    }
    return fallback
  }

  /**
   * Splits text into sentences on ASCII and CJK terminators (. ! ? 。 ！ ？),
   * keeping only pieces whose trimmed length exceeds `minLength`.
   */
  private splitSentences(content: string, minLength: number = 0): string[] {
    return content.split(/[.!?。！？]+/).filter(s => s.trim().length > minLength)
  }

  // Quality assessment methods

  /** Scores the presence of verifiable facts: dates, figures, places, people or organisations. */
  private async assessFactualAccuracy(content: string): Promise<number> {
    let score = 0.5

    // Dates.
    if (/\d{4}年|\d{1,2}月|\d{1,2}日/.test(content)) score += 0.1

    // Concrete numbers.
    if (/\d+\.?\d*%|\d+万|\d+亿|\d+千/.test(content)) score += 0.1

    // Places.
    if (/市|省|县|区|国|州/.test(content)) score += 0.1

    // People or organisations.
    if (/先生|女士|部长|主席|公司|集团|大学|医院/.test(content)) score += 0.1

    return Math.min(1, score)
  }

  /**
   * Simple source reliability score derived from the URL text.
   * NOTE(review): matching is by substring over the whole URL, so a trusted
   * domain appearing in a path or query string also scores as reliable.
   */
  private async assessSourceReliability(sourceUrl: string): Promise<number> {
    if (!sourceUrl) return 0.3

    const reliableDomains = ['gov.cn', 'edu.cn', 'xinhuanet.com', 'people.com.cn', 'cctv.com']
    const url = sourceUrl.toLowerCase()

    if (reliableDomains.some(domain => url.includes(domain))) return 0.9
    if (url.includes('.gov') || url.includes('.edu')) return 0.8
    if (url.includes('news') || url.includes('media')) return 0.6

    return 0.4
  }

  /** Scores writing quality from sentence count, paragraph count and vocabulary diversity. */
  private async assessWritingQuality(content: string): Promise<number> {
    let score = 0.5

    // More than two sentences.
    if (this.splitSentences(content).length > 2) score += 0.1

    // More than one paragraph.
    const paragraphs = content.split('\n\n').filter(p => p.trim().length > 0)
    if (paragraphs.length > 1) score += 0.1

    // Vocabulary diversity: share of distinct words. Guard the empty case,
    // where 0 / 0 would otherwise turn the whole score into NaN.
    const words = content.split(/\s+/).filter(w => w.length > 0)
    const uniqueWords = new Set(words.map(w => w.toLowerCase()))
    const diversity = words.length > 0 ? uniqueWords.size / words.length : 0
    score += diversity * 0.3

    return Math.min(1, score)
  }

  /** Scores objectivity: starts at 0.7 and subtracts for subjective and emotional wording. */
  private async assessObjectivity(content: string): Promise<number> {
    let score = 0.7 // baseline objectivity

    // Subjective wording penalty.
    const subjectiveWords = ['我认为', '个人觉得', '显然', '明显', '无疑', '肯定']
    const subjectiveCount = subjectiveWords.filter(word => content.includes(word)).length
    score -= subjectiveCount * 0.1

    // Emotional wording penalty.
    const emotionalWords = ['愤怒', '激动', '兴奋', '失望', '震惊', '惊喜']
    const emotionalCount = emotionalWords.filter(word => content.includes(word)).length
    score -= emotionalCount * 0.05

    return Math.max(0, Math.min(1, score))
  }

  /** Scores completeness from content length and the presence of who/what/when/where/why cues. */
  private async assessCompleteness(content: string): Promise<number> {
    let score = 0.3

    // Length thresholds (cumulative).
    if (content.length > 200) score += 0.2
    if (content.length > 500) score += 0.2
    if (content.length > 1000) score += 0.2

    // Key news elements; five cues are checked.
    const elementPatterns = [
      /人|者|员|家|国|公司|组织/, // who
      /事件|活动|发生|进行|宣布|决定/, // what
      /\d{4}年|\d{1,2}月|\d{1,2}日|今天|昨天|明天/, // when
      /市|省|县|区|国|地区|地点/, // where
      /因为|由于|原因|目的|为了/ // why
    ]
    const elements = elementPatterns.filter(pattern => pattern.test(content)).length
    score += elements * 0.06

    return Math.min(1, score)
  }

  /**
   * Scores timeliness from the content's age; newer content scores higher.
   *
   * @param publishedAt Publication time as epoch milliseconds.
   */
  private async assessTimeliness(publishedAt: number): Promise<number> {
    const ageHours = (Date.now() - publishedAt) / (1000 * 60 * 60)

    if (ageHours < 1) return 1.0
    if (ageHours < 6) return 0.9
    if (ageHours < 24) return 0.7
    if (ageHours < 72) return 0.5
    if (ageHours < 168) return 0.3
    return 0.1
  }

  /** Scores how well the content matches the keywords of the given category; 0.5 when the category is unknown. */
  private async assessRelevance(content: string, categoryId?: string): Promise<number> {
    if (!categoryId) return 0.5

    const category = this.NEWS_CATEGORIES.find(c => c.id === categoryId)
    if (!category) return 0.5

    const haystack = content.toLowerCase()
    const matches = category.keywords.filter(keyword => haystack.includes(keyword.toLowerCase())).length

    return Math.min(1, matches / category.keywords.length + 0.3)
  }

  // Mock AI service methods

  /** Mock OpenAI sentiment: simulated latency followed by a keyword-based score in [-1, 1]. */
  private async analyzeSentimentWithOpenAI(content: string): Promise<number> {
    await this.delay(Math.random() * 500 + 200)

    const positiveWords = ['好', '棒', '优秀', '成功', '胜利', '喜悦', '高兴', '满意']
    const negativeWords = ['坏', '糟糕', '失败', '问题', '困难', '悲伤', '愤怒', '失望']

    const positiveCount = positiveWords.filter(word => content.includes(word)).length
    const negativeCount = negativeWords.filter(word => content.includes(word)).length

    const score = (positiveCount - negativeCount) / Math.max(positiveCount + negativeCount, 1)
    return Math.max(-1, Math.min(1, score))
  }

  /** Mock Google sentiment: simulated latency followed by a random score in [-1, 1). */
  private async analyzeSentimentWithGoogle(content: string): Promise<number> {
    await this.delay(Math.random() * 400 + 150)
    return Math.random() * 2 - 1 // -1 to 1
  }

  /** Mock Baidu sentiment: simulated latency followed by a random score in [-1, 1). */
  private async analyzeSentimentWithBaidu(content: string): Promise<number> {
    await this.delay(Math.random() * 300 + 100)
    return Math.random() * 2 - 1
  }

  /** Basic keyword sentiment: (positive - negative) / max(positive + negative, 1). */
  private analyzeSentimentBasic(content: string): number {
    const positiveWords = ['好', '棒', '优秀', '成功', '胜利', 'great', 'good', 'excellent']
    const negativeWords = ['坏', '糟糕', '失败', '问题', 'bad', 'terrible', 'awful']

    const haystack = content.toLowerCase()
    const positiveCount = positiveWords.filter(word => haystack.includes(word)).length
    const negativeCount = negativeWords.filter(word => haystack.includes(word)).length

    return (positiveCount - negativeCount) / Math.max(positiveCount + negativeCount, 1)
  }

  /** Mock OpenAI entity extraction: simulated latency, then regex matching; returns at most ten entities. */
  private async extractEntitiesWithOpenAI(content: string): Promise<EntityResult[]> {
    await this.delay(Math.random() * 600 + 300)

    const entities: EntityResult[] = []
    const patterns = [
      { regex: /[\u4e00-\u9fa5]{2,4}(公司|集团|企业|机构)/g, type: 'organization' as const },
      { regex: /[\u4e00-\u9fa5]{2,3}(市|省|县|区)/g, type: 'location' as const },
      { regex: /[\u4e00-\u9fa5]{2,4}(先生|女士|部长|主席|总裁|经理)/g, type: 'person' as const },
      { regex: /\d{4}年\d{1,2}月\d{1,2}日/g, type: 'date' as const },
      { regex: /\d+\.?\d*(万|亿|千)?(元|美元|英镑)/g, type: 'money' as const }
    ]

    for (const pattern of patterns) {
      for (const match of content.matchAll(pattern.regex)) {
        const start = match.index ?? 0
        entities.push({
          text: match[0],
          type: pattern.type,
          confidence: 0.8 + Math.random() * 0.2,
          startPosition: start,
          endPosition: start + match[0].length
        })
      }
    }

    return entities.slice(0, 10)
  }

  /** Mock Google entity extraction: simulated latency, then the basic extractor. */
  private async extractEntitiesWithGoogle(content: string): Promise<EntityResult[]> {
    await this.delay(Math.random() * 500 + 250)
    return this.extractEntitiesBasic(content)
  }

  /** Basic entity extraction: organisations whose name ends in 公司 or 集团. */
  private extractEntitiesBasic(content: string): EntityResult[] {
    const entities: EntityResult[] = []

    for (const match of content.matchAll(/[\u4e00-\u9fa5]{2,4}(公司|集团)/g)) {
      const start = match.index ?? 0
      entities.push({
        text: match[0],
        type: 'organization',
        confidence: 0.7,
        startPosition: start,
        endPosition: start + match[0].length
      })
    }

    return entities
  }

  /**
   * Mock OpenAI topic extraction: simulated latency, then keyword matching
   * against the first three entries of NEWS_CATEGORIES only.
   */
  private async extractTopicsWithOpenAI(content: string): Promise<TopicResult[]> {
    await this.delay(Math.random() * 400 + 200)

    const haystack = content.toLowerCase()
    const topics: TopicResult[] = []

    for (const category of this.NEWS_CATEGORIES.slice(0, 3)) {
      const hits = category.keywords.filter(keyword => haystack.includes(keyword.toLowerCase()))
      if (hits.length > 0) {
        topics.push({
          name: category.name,
          confidence: hits.length / category.keywords.length,
          keywords: hits
        })
      }
    }

    return topics.sort((a, b) => b.confidence - a.confidence)
  }

  /** Basic topic extraction: only detects the technology topic from a fixed keyword list. */
  private extractTopicsBasic(content: string): TopicResult[] {
    const topics: TopicResult[] = []

    const techKeywords = ['科技', '技术', '互联网', 'AI', '人工智能']
    const techMatches = techKeywords.filter(keyword => content.includes(keyword))

    if (techMatches.length > 0) {
      topics.push({
        name: '科技',
        confidence: techMatches.length / techKeywords.length,
        keywords: techMatches
      })
    }

    return topics
  }
}