discourse/analysis/app/insight_generator.py

"""
Insight Generator module for VoxPop.
Generates structured insights from analyzed perspective data.
"""
import logging
import json
from typing import Dict, List, Any
import time
import os
from datetime import datetime
logger = logging.getLogger(__name__)
class InsightGenerator:
    """
    Generates structured insights from analyzed perspective data.
    """

    def __init__(self, output_dir: str = "insights"):
        """
        Initialize the insight generator.

        Args:
            output_dir: Directory to store generated insights
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Insight generator initialized with output directory: {output_dir}")

    def generate_insights(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate structured insights from analysis results.

        Args:
            analysis_results: Results from the analyzer, expected to contain
                an "insights" list and a "perspectives" list

        Returns:
            Structured insights with metadata
        """
        logger.info("Generating insights from analysis results")

        # Extract the raw insights produced by the analyzer
        raw_insights = analysis_results.get("insights", [])

        # Format insights, dropping any below the minimum confidence threshold
        structured_insights = []
        for insight in raw_insights:
            summary = insight.get("summary", "")
            confidence = insight.get("confidence", 0.0)
            # Skip low-confidence insights
            if confidence < 0.05:
                continue
            structured_insights.append({
                "summary": summary,
                "confidence": confidence,
                "timestamp": datetime.now().isoformat()
            })

        # Save insights to a timestamped file in the output directory
        timestamp = int(time.time())
        output_path = os.path.join(self.output_dir, f"insights_{timestamp}.json")
        insights_data = {
            "insights": structured_insights,
            "metadata": {
                "timestamp": timestamp,
                "perspective_count": len(analysis_results.get("perspectives", [])),
                "generated_at": datetime.now().isoformat()
            }
        }
        try:
            with open(output_path, 'w') as f:
                json.dump(insights_data, f, indent=2)
            logger.info(f"Insights saved to {output_path}")
        except Exception as e:
            logger.error(f"Error saving insights to file: {e}")
        return insights_data

    def get_recent_insights(self, count: int = 10) -> List[Dict[str, Any]]:
        """
        Get the most recent insights.

        Args:
            count: Number of recent insight files to retrieve

        Returns:
            List of insight dictionaries
        """
        insights = []
        try:
            # Collect all insight files in the output directory
            files = [os.path.join(self.output_dir, f) for f in os.listdir(self.output_dir)
                     if f.startswith("insights_") and f.endswith(".json")]
            # Sort by modification time (most recent first)
            files.sort(key=os.path.getmtime, reverse=True)
            # Load the most recent files
            for file_path in files[:count]:
                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    insights.append(data)
                except Exception as e:
                    logger.error(f"Error loading insight file {file_path}: {e}")
        except Exception as e:
            logger.error(f"Error retrieving recent insights: {e}")
        return insights

    def get_consolidated_insights(self, days: int = 1) -> Dict[str, Any]:
        """
        Consolidate insights from a recent time period.

        Args:
            days: Number of days to look back

        Returns:
            Consolidated insights with metadata
        """
        # Gather insights from all files modified within the look-back window
        all_insights = []
        cutoff_time = time.time() - (days * 24 * 60 * 60)
        try:
            files = [os.path.join(self.output_dir, f) for f in os.listdir(self.output_dir)
                     if f.startswith("insights_") and f.endswith(".json")]
            for file_path in files:
                # Only include files modified within the time period
                if os.path.getmtime(file_path) >= cutoff_time:
                    try:
                        with open(file_path, 'r') as f:
                            data = json.load(f)
                        all_insights.extend(data.get("insights", []))
                    except Exception as e:
                        logger.error(f"Error loading insight file {file_path}: {e}")
        except Exception as e:
            logger.error(f"Error consolidating insights: {e}")

        # Group similar insights (simplified approach): summaries sharing their
        # first three words are treated as the same insight
        consolidated = {}
        for insight in all_insights:
            summary = insight.get("summary", "")
            key_words = " ".join(summary.split()[:3])
            if key_words in consolidated:
                # Keep the highest confidence seen for this group
                consolidated[key_words]["confidence"] = max(
                    consolidated[key_words]["confidence"],
                    insight.get("confidence", 0.0)
                )
                consolidated[key_words]["count"] += 1
                # Track the most recent sighting (ISO timestamps sort lexicographically)
                consolidated[key_words]["last_seen"] = max(
                    consolidated[key_words]["last_seen"],
                    insight.get("timestamp", "")
                )
            else:
                consolidated[key_words] = {
                    "summary": summary,
                    "confidence": insight.get("confidence", 0.0),
                    "count": 1,
                    "last_seen": insight.get("timestamp", datetime.now().isoformat())
                }

        # Convert to a list sorted by confidence (highest first)
        result = list(consolidated.values())
        result.sort(key=lambda x: x["confidence"], reverse=True)
        return {
            "consolidated_insights": result,
            "metadata": {
                "period_days": days,
                "total_insights": len(all_insights),
                "consolidated_count": len(result),
                "generated_at": datetime.now().isoformat()
            }
        }
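

if __name__ == "__main__":
    # Minimal usage sketch. The sample analysis_results below is made-up data
    # in the shape generate_insights() reads (an "insights" list of
    # summary/confidence dicts plus a "perspectives" list); the real analyzer
    # output may carry additional fields.
    logging.basicConfig(level=logging.INFO)
    generator = InsightGenerator(output_dir="insights")
    sample_results = {
        "perspectives": ["p1", "p2", "p3"],
        "insights": [
            {"summary": "Transit riders want more frequent service", "confidence": 0.82},
            {"summary": "Parking fees are seen as too high", "confidence": 0.41},
            {"summary": "Stadium noise complaints", "confidence": 0.03},  # below threshold, dropped
        ],
    }
    generated = generator.generate_insights(sample_results)
    print(f"Kept {len(generated['insights'])} of {len(sample_results['insights'])} insights")
    print(json.dumps(generator.get_consolidated_insights(days=1), indent=2))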