Skip to content
Closed
Prev Previous commit
Next Next commit
feat: implement claim extraction, clustering, and triangulated summary
  • Loading branch information
raydeStar committed Dec 9, 2025
commit ede3d021220a79f11bc7f2fe6b338f4579e2aa2d
328 changes: 307 additions & 21 deletions src/lib/search/newsTriangulate.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import EventEmitter from 'events';
import { randomUUID } from 'node:crypto';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import { MetaSearchAgentType } from './metaSearchAgent';
import { searchSearxng } from '../searxng';
import { Lane, NewsSource, TriangulatedNewsResult } from '@/types/newsTriangulate';
import computeSimilarity from '../utils/computeSimilarity';
import {
Lane,
NewsSource,
NewsClaim,
ClaimCluster,
TriangulatedNewsResult,
} from '@/types/newsTriangulate';

type FetchOptions = {
engines?: string[];
Expand Down Expand Up @@ -145,23 +154,286 @@ export const selectBalancedNewsSources = (
return balanced;
};

const buildPlaceholderResult = (sources: NewsSource[]): TriangulatedNewsResult => {
const laneCounts = sources.reduce<Record<Lane, number>>((acc, source) => {
const lane = source.lane ?? 'UNKNOWN';
acc[lane] = (acc[lane] ?? 0) + 1;
return acc;
}, {} as Record<Lane, number>);
/**
* Extracts structured claims from a news source using the LLM.
* Returns an array of claims with metadata (who, when, where, type).
*/
const extractClaimsFromSource = async (
source: NewsSource,
llm: BaseChatModel,
): Promise<NewsClaim[]> => {
const prompt = `You are a claim extraction system. Extract factual claims from this news article.

Title: ${source.title}
Content: ${source.snippet || 'No content available'}

Return a JSON array of claims. Each claim should have:
- "text": the claim statement (1-2 sentences)
- "who": who is making or involved in the claim (if mentioned)
- "when": when did this happen (if mentioned)
- "where": location (if mentioned)
- "type": one of "fact", "opinion", "quote", or "other"
- "confidence": "high" if directly stated, "medium" if implied, "low" if uncertain

Return ONLY valid JSON array. Example:
[{"text":"The president signed the bill","who":"president","when":"today","where":"White House","type":"fact","confidence":"high"}]

If no clear claims can be extracted, return an empty array: []`;

try {
const response = await llm.invoke(prompt);
const content =
typeof response.content === 'string'
? response.content
: JSON.stringify(response.content);

// Extract JSON from response (handle markdown code blocks)
const jsonMatch = content.match(/\[[\s\S]*\]/);
if (!jsonMatch) return [];

const parsed = JSON.parse(jsonMatch[0]) as Array<{
text: string;
who?: string;
when?: string;
where?: string;
type?: string;
confidence?: string;
}>;

return parsed.map((claim) => ({
id: randomUUID(),
sourceId: source.id,
lane: source.lane,
text: claim.text,
who: claim.who,
when: claim.when,
where: claim.where,
claimType: (['fact', 'opinion', 'quote', 'other'].includes(claim.type || '')
? claim.type
: 'other') as NewsClaim['claimType'],
confidence: (['low', 'medium', 'high'].includes(claim.confidence || '')
? claim.confidence
: 'medium') as NewsClaim['confidence'],
}));
} catch (err) {
console.error(`Claim extraction failed for source ${source.id}:`, err);
return [];
}
};

/**
* Embeds all claims and clusters them by similarity.
* Returns clusters with supporting/disagreeing claims and lane coverage.
*/
const buildClaimClusters = async (
claims: NewsClaim[],
embeddings: Embeddings,
similarityThreshold = 0.75,
): Promise<ClaimCluster[]> => {
if (claims.length === 0) return [];

// Embed all claim texts
const claimTexts = claims.map((c) => c.text);
const vectors = await embeddings.embedDocuments(claimTexts);

// Track which claims have been clustered
const clustered = new Set<number>();
const clusters: ClaimCluster[] = [];

for (let i = 0; i < claims.length; i++) {
if (clustered.has(i)) continue;

const cluster: NewsClaim[] = [claims[i]];
clustered.add(i);

// Find similar claims
for (let j = i + 1; j < claims.length; j++) {
if (clustered.has(j)) continue;

const similarity = computeSimilarity(vectors[i], vectors[j]);
if (similarity >= similarityThreshold) {
cluster.push(claims[j]);
clustered.add(j);
}
}

// Build cluster metadata
const lanes = [...new Set(cluster.map((c) => c.lane).filter(Boolean))] as Lane[];
const uniqueSources = new Set(cluster.map((c) => c.sourceId));

// Determine agreement level based on source diversity
let agreementLevel: ClaimCluster['agreementLevel'] = 'low';
if (uniqueSources.size >= 3 && lanes.length >= 2) {
agreementLevel = 'high';
} else if (uniqueSources.size >= 2) {
agreementLevel = 'medium';
}

clusters.push({
clusterId: randomUUID(),
representativeText: cluster[0].text,
supportingClaims: cluster.map((c) => ({
claimId: c.id,
sourceId: c.sourceId,
lane: c.lane,
})),
lanesCovered: lanes.length > 0 ? lanes : ['UNKNOWN'],
agreementLevel,
});
}

return clusters;
};

/**
* Categorizes clusters into shared facts, conflicts, and unique angles.
*/
const categorizeClaimClusters = (
clusters: ClaimCluster[],
): {
sharedFacts: ClaimCluster[];
conflicts: ClaimCluster[];
uniqueAngles: ClaimCluster[];
} => {
const sharedFacts: ClaimCluster[] = [];
const conflicts: ClaimCluster[] = [];
const uniqueAngles: ClaimCluster[] = [];

for (const cluster of clusters) {
const sourceCount = cluster.supportingClaims.length;
const laneCount = cluster.lanesCovered.length;

// Shared facts: multiple sources across multiple lanes
if (sourceCount >= 2 && laneCount >= 2) {
sharedFacts.push(cluster);
}
// Unique angles: single source or single lane dominance
else if (sourceCount === 1 || laneCount === 1) {
uniqueAngles.push(cluster);
}
// Conflicts: detected disagreement (future enhancement)
else {
conflicts.push(cluster);
}
}

// Sort by agreement level and source count
const sortByRelevance = (a: ClaimCluster, b: ClaimCluster) => {
const levelOrder = { high: 0, medium: 1, low: 2 };
const levelDiff = levelOrder[a.agreementLevel] - levelOrder[b.agreementLevel];
if (levelDiff !== 0) return levelDiff;
return b.supportingClaims.length - a.supportingClaims.length;
};

sharedFacts.sort(sortByRelevance);
uniqueAngles.sort(sortByRelevance);

return { sharedFacts, conflicts, uniqueAngles };
};

/**
* Generates a neutral summary of the triangulated news using the LLM.
*/
const generateNeutralSummary = async (
sharedFacts: ClaimCluster[],
conflicts: ClaimCluster[],
uniqueAngles: ClaimCluster[],
laneCounts: Record<Lane, number>,
llm: BaseChatModel,
): Promise<string> => {
const sharedFactsText = sharedFacts
.slice(0, 5)
.map((c) => `- ${c.representativeText}`)
.join('\n');

const uniqueAnglesText = uniqueAngles
.slice(0, 3)
.map((c) => `- ${c.representativeText} (${c.lanesCovered.join(', ')})`)
.join('\n');

const laneBreakdown = Object.entries(laneCounts)
.filter(([, count]) => count > 0)
.map(([lane, count]) => `${lane}: ${count}`)
.join(', ');

const prompt = `You are a neutral news summarizer. Based on the following claims extracted from multiple news sources across different political perspectives, write a balanced 2-3 sentence summary.

Source distribution: ${laneBreakdown}

Shared facts (reported by multiple sources):
${sharedFactsText || 'None identified'}

Unique angles (reported by single sources):
${uniqueAnglesText || 'None identified'}

Guidelines:
- Be neutral and factual
- Acknowledge uncertainty where sources disagree
- Do not favor any political perspective
- Keep it concise (2-3 sentences)

Summary:`;

try {
const response = await llm.invoke(prompt);
const content =
typeof response.content === 'string'
? response.content
: String(response.content);
return content.trim();
} catch (err) {
console.error('Summary generation failed:', err);
return 'Unable to generate summary. Please review the sources below for details.';
}
};

/**
* Builds the full triangulated news result from sources, claims, and clusters.
*/
const buildTriangulatedResult = async (
sources: NewsSource[],
llm: BaseChatModel,
embeddings: Embeddings,
): Promise<TriangulatedNewsResult> => {
// Extract claims from all sources (parallel)
const claimArrays = await Promise.all(
sources.map((source) => extractClaimsFromSource(source, llm)),
);
const allClaims = claimArrays.flat();

// Build claim clusters
const clusters = await buildClaimClusters(allClaims, embeddings);

// Categorize clusters
const { sharedFacts, conflicts, uniqueAngles } = categorizeClaimClusters(clusters);

// Calculate lane counts
const laneCounts = sources.reduce<Record<Lane, number>>(
(acc, source) => {
const lane = source.lane ?? 'UNKNOWN';
acc[lane] = (acc[lane] ?? 0) + 1;
return acc;
},
{ LEFT: 0, RIGHT: 0, CENTER: 0, UNKNOWN: 0 },
);

// Generate neutral summary
const summary = await generateNeutralSummary(
sharedFacts,
conflicts,
uniqueAngles,
laneCounts,
llm,
);

return {
summary:
'Triangulated news placeholder: fetching diversified sources and preparing claim graph scaffolding.',
sharedFacts: [],
conflicts: [],
uniqueAngles: [],
lanes: Object.entries(laneCounts).map(([lane, count]) => ({
lane: lane as Lane,
count,
})),
summary,
sharedFacts: sharedFacts.slice(0, 10),
conflicts: conflicts.slice(0, 5),
uniqueAngles: uniqueAngles.slice(0, 5),
lanes: Object.entries(laneCounts)
.filter(([, count]) => count > 0)
.map(([lane, count]) => ({ lane: lane as Lane, count })),
sources,
};
};
Expand All @@ -170,8 +442,8 @@ export class NewsTriangulationAgent implements MetaSearchAgentType {
async searchAndAnswer(
message: string,
_history: any[],
_llm: any,
_embeddings: any,
llm: BaseChatModel,
embeddings: Embeddings,
_optimizationMode: any,
_fileIds: string[],
_systemInstructions: string,
Expand All @@ -180,6 +452,7 @@ export class NewsTriangulationAgent implements MetaSearchAgentType {

queueMicrotask(async () => {
try {
// Phase 1: Fetch and normalize news sources
const sources = await fetchNormalizedNewsResults(message, {
engines: ['bing news'],
pageno: 1,
Expand All @@ -188,23 +461,36 @@ export class NewsTriangulationAgent implements MetaSearchAgentType {
perDomainCap: 2,
});

// Phase 2: Tag lanes and balance sources
const tagged = withLaneTags(sources);
const balanced = selectBalancedNewsSources(tagged);

const placeholder = buildPlaceholderResult(balanced);
// Phase 3/4: Extract claims, cluster, and build triangulated result
const result = await buildTriangulatedResult(balanced, llm, embeddings);

// Emit structured response
emitter.emit(
'data',
JSON.stringify({ type: 'response', data: placeholder.summary }),
JSON.stringify({ type: 'response', data: result.summary }),
);

emitter.emit(
'data',
JSON.stringify({ type: 'sources', data: placeholder.sources }),
JSON.stringify({
type: 'sources',
data: {
sources: result.sources,
sharedFacts: result.sharedFacts,
conflicts: result.conflicts,
uniqueAngles: result.uniqueAngles,
lanes: result.lanes,
},
}),
);

emitter.emit('end');
} catch (error) {
console.error('News triangulation failed:', error);
emitter.emit('error', error);
}
});
Expand Down