Add Multimodal News Analysis Example with Firecrawl #21

Open: wants to merge 1 commit into `main`
6 changes: 6 additions & 0 deletions examples/firecrawl_news_analysis/.env.example
@@ -0,0 +1,6 @@
# Firecrawl API Configuration
FIRECRAWL_API_KEY=your_firecrawl_api_key

# News API Configuration
NEWSAPI_KEY=your_newsapi_key
NYT_API_KEY=your_new_york_times_api_key
36 changes: 36 additions & 0 deletions examples/firecrawl_news_analysis/README.md
@@ -0,0 +1,36 @@
# Firecrawl News Analysis Example

## Overview
An end-to-end example that retrieves news articles from multiple sources, scrapes their full content with Firecrawl, and runs sentiment and keyword analysis on the results.

## Features
- Multi-source news article retrieval
- Intelligent web scraping with Firecrawl
- Sentiment analysis
- Keyword extraction
- Flexible search capabilities

## Prerequisites
- Python 3.9+
- API keys for Firecrawl, NewsAPI, and The New York Times (see `.env.example`)

## Setup
1. Clone the repository
2. Install dependencies: `pip install -r requirements.txt`
3. Create a `.env` file with your API keys:
```
FIRECRAWL_API_KEY=your_key_here
NEWSAPI_KEY=your_key_here
NYT_API_KEY=your_key_here
```
4. Download the spaCy English model used for keyword extraction: `python -m spacy download en_core_web_sm`
5. Run the application: `python main.py`

## Usage
```bash
python main.py --query "AI Technology" --from-date 2024-01-01 --to-date 2024-02-01
```

## Contributing
Contributions are welcome! Please read our contributing guidelines.

## License
MIT License
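
One note on the Usage example in this README: dates are normalized by `utils/helpers.py` (see the last file in this diff), so `MM/DD/YYYY` and `DD-MM-YYYY` inputs also work. For example:

```bash
python main.py --query "climate policy" --from-date 01/15/2024 --to-date 02/15/2024
```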
36 changes: 36 additions & 0 deletions examples/firecrawl_news_analysis/config.py
@@ -0,0 +1,36 @@
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# API Configuration
API_CONFIG = {
    'firecrawl': {
        'api_key': os.getenv('FIRECRAWL_API_KEY'),
        'base_url': 'https://api.firecrawl.dev/v0'
    },
    'newsapi': {
        'api_key': os.getenv('NEWSAPI_KEY'),
        'base_url': 'https://newsapi.org/v2'
    },
    'nyt': {
        'api_key': os.getenv('NYT_API_KEY'),
        'base_url': 'https://api.nytimes.com/svc/search/v2'
    }
}

# Default search parameters
DEFAULT_SEARCH_PARAMS = {
    'query': 'technology',
    'from_date': '2024-01-01',
    'to_date': '2024-02-01',
    'language': 'en',
    'sources': ['techcrunch', 'wired', 'the-verge']
}

# Logging configuration
LOGGING_CONFIG = {
    'level': 'INFO',
    'format': '%(asctime)s - %(levelname)s: %(message)s'
}
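
For reviewers: a sketch of how the `firecrawl` entry above could drive a single scrape call. The request shape (POST `/scrape` with a JSON `url` and bearer-token auth, content nested under `data`) reflects my reading of Firecrawl's v0 REST API; `scrape_page` is a hypothetical helper, not part of this PR, and the field names should be verified against current Firecrawl docs.

```python
# Hypothetical helper, not part of this PR; the v0 request/response
# shape is an assumption to verify against Firecrawl's documentation.
import requests

from config import API_CONFIG

def scrape_page(url: str) -> str:
    """Scrape one page via the configured Firecrawl v0 endpoint."""
    cfg = API_CONFIG['firecrawl']
    resp = requests.post(
        f"{cfg['base_url']}/scrape",
        headers={'Authorization': f"Bearer {cfg['api_key']}"},
        json={'url': url},
        timeout=60,
    )
    resp.raise_for_status()
    # v0 responses nest the scraped content under "data"
    return resp.json().get('data', {}).get('markdown', '')
```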
90 changes: 90 additions & 0 deletions examples/firecrawl_news_analysis/main.py
@@ -0,0 +1,90 @@
import argparse
import logging

from config import DEFAULT_SEARCH_PARAMS, LOGGING_CONFIG
from utils.helpers import validate_date
from api_clients.firecrawl_client import FirecrawlNewsClient
from data_processing.analyzer import NewsAnalyzer

def setup_logging():
    """Configure logging for the application."""
    logging.basicConfig(
        level=LOGGING_CONFIG['level'],
        format=LOGGING_CONFIG['format']
    )

def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description='Firecrawl News Analysis Tool')
    parser.add_argument('--query', default=DEFAULT_SEARCH_PARAMS['query'],
                        help='Search query for news articles')
    parser.add_argument('--from-date', default=DEFAULT_SEARCH_PARAMS['from_date'],
                        help='Start date for news search (YYYY-MM-DD)')
    parser.add_argument('--to-date', default=DEFAULT_SEARCH_PARAMS['to_date'],
                        help='End date for news search (YYYY-MM-DD)')

    return parser.parse_args()

def main():
    """
    Main application workflow for news analysis.

    Steps:
    1. Parse arguments
    2. Validate dates
    3. Fetch news articles using Firecrawl
    4. Perform analysis
    5. Display results
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    try:
        # Parse command-line arguments
        args = parse_arguments()

        # Validate and standardize dates
        from_date = validate_date(args.from_date)
        to_date = validate_date(args.to_date)

        # Initialize Firecrawl client
        firecrawl_client = FirecrawlNewsClient()

        # Fetch news articles
        logger.info(f"Searching news for query: {args.query}")
        news_articles = firecrawl_client.search_news(
            query=args.query,
            from_date=from_date,
            to_date=to_date
        )

        # Perform news analysis
        analyzer = NewsAnalyzer(news_articles)
        analysis_results = {
            'total_articles': len(news_articles),
            'sentiment_summary': analyzer.analyze_sentiment(),
            'top_keywords': analyzer.extract_top_keywords(),
            'source_distribution': analyzer.analyze_sources()
        }

        # Display results
        print("\n--- News Analysis Results ---")
        print(f"Total Articles: {analysis_results['total_articles']}")
        print("\nSentiment Summary:")
        for sentiment, count in analysis_results['sentiment_summary'].items():
            print(f"{sentiment.capitalize()}: {count}")

        print("\nTop Keywords:")
        for keyword, freq in analysis_results['top_keywords'].items():
            print(f"{keyword}: {freq}")

        print("\nNews Sources:")
        for source, count in analysis_results['source_distribution'].items():
            print(f"{source}: {count}")

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        # Exit non-zero so callers can detect failure
        raise SystemExit(1)

if __name__ == "__main__":
    main()
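
Note for reviewers: `main.py` imports `FirecrawlNewsClient` and `NewsAnalyzer`, but neither `api_clients/` nor `data_processing/` is included in this diff. A minimal sketch of what the client might look like, assuming article discovery goes through NewsAPI's documented `/v2/everything` endpoint; the class body is illustrative, not the PR's actual implementation.

```python
# api_clients/firecrawl_client.py -- illustrative sketch only; the real
# module is imported by main.py but missing from this diff.
import requests

from config import API_CONFIG

class FirecrawlNewsClient:
    """Discover articles via NewsAPI; each hit can then be scraped."""

    def search_news(self, query: str, from_date: str, to_date: str) -> list:
        cfg = API_CONFIG['newsapi']
        resp = requests.get(
            f"{cfg['base_url']}/everything",
            params={
                'q': query,
                'from': from_date,
                'to': to_date,
                'language': 'en',
                'apiKey': cfg['api_key'],
            },
            timeout=30,
        )
        resp.raise_for_status()
        articles = resp.json().get('articles', [])

        # Full-content enrichment via Firecrawl would go here,
        # e.g. the scrape call sketched after config.py above.
        return articles
```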
15 changes: 15 additions & 0 deletions examples/firecrawl_news_analysis/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Core Dependencies
firecrawl-py==1.9.0
requests==2.31.0
python-dotenv==1.0.0

# Data Processing
pandas==2.1.4
textblob==0.17.1

# Optional: Advanced NLP
spacy==3.7.4
nltk==3.8.1

# Testing
pytest==7.4.4
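
The `textblob` pin above hints at how the missing `data_processing/analyzer.py` might implement the `NewsAnalyzer` interface that `main.py` calls (`analyze_sentiment`, `extract_top_keywords`, `analyze_sources`). A minimal sketch, assuming each article is a NewsAPI-style dict with `title`, `content`, and a nested `source` mapping; the polarity thresholds and method bodies are illustrative guesses, not the PR's actual code.

```python
# data_processing/analyzer.py -- illustrative sketch only; the real
# module is imported by main.py but absent from this diff.
from collections import Counter
from typing import Dict, List

from textblob import TextBlob

class NewsAnalyzer:
    def __init__(self, articles: List[dict]):
        self.articles = articles

    def analyze_sentiment(self) -> Dict[str, int]:
        """Bucket articles by TextBlob polarity (range -1.0 to 1.0)."""
        counts = Counter()
        for article in self.articles:
            text = article.get('content') or article.get('title', '')
            polarity = TextBlob(text).sentiment.polarity
            if polarity > 0.1:
                counts['positive'] += 1
            elif polarity < -0.1:
                counts['negative'] += 1
            else:
                counts['neutral'] += 1
        return dict(counts)

    def extract_top_keywords(self, top_n: int = 10) -> Dict[str, int]:
        """Naive word-frequency count over titles."""
        words = Counter()
        for article in self.articles:
            words.update(w.lower() for w in article.get('title', '').split()
                         if len(w) > 3)
        return dict(words.most_common(top_n))

    def analyze_sources(self) -> Dict[str, int]:
        """Count articles per source name (NewsAPI nests it under 'source')."""
        sources = Counter(
            (article.get('source') or {}).get('name', 'unknown')
            for article in self.articles
        )
        return dict(sources)
```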
75 changes: 75 additions & 0 deletions examples/firecrawl_news_analysis/utils/helpers.py
@@ -0,0 +1,75 @@
import re
from datetime import datetime
from typing import Dict

def validate_date(date_str: str) -> str:
    """
    Validate and standardize date input.

    Args:
        date_str (str): Input date string

    Returns:
        str: Validated date in YYYY-MM-DD format; falls back to
        today's date if the input matches no supported format
    """
    # Try parsing the date in the supported formats
    for fmt in ('%Y-%m-%d', '%m/%d/%Y', '%d-%m-%Y'):
        try:
            return datetime.strptime(date_str, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue

    # Default to the current date if no format matches
    print(f"Date validation error: invalid date format: {date_str}")
    return datetime.now().strftime('%Y-%m-%d')

def clean_text(text: str) -> str:
    """
    Clean and normalize text.

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned, lowercased text
    """
    if not text:
        return ""

    # Remove special characters and extra whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

def extract_keywords(text: str, top_n: int = 10) -> Dict[str, int]:
    """
    Extract top keywords from text.

    Args:
        text (str): Input text
        top_n (int): Number of top keywords to return

    Returns:
        Dict: Top keywords mapped to their frequencies
    """
    from collections import Counter
    import spacy

    try:
        # Requires the small English model:
        #   python -m spacy download en_core_web_sm
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(text)

        # Keep only nouns and proper nouns as keyword candidates
        keywords = [token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')]

        return dict(Counter(keywords).most_common(top_n))
    except Exception as e:
        print(f"Keyword extraction error: {e}")
        return {}
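
A quick usage sketch for the helpers above; the sample strings are hypothetical, and `extract_keywords` requires the `en_core_web_sm` model to be installed.

```python
from utils.helpers import validate_date, clean_text, extract_keywords

print(validate_date('01/15/2024'))            # -> '2024-01-15'
print(validate_date('15-01-2024'))            # -> '2024-01-15'
print(clean_text('  Breaking: AI, news!!  ')) # -> 'breaking ai news'
print(extract_keywords('Apple unveiled a new chip for laptops.', top_n=3))
# -> e.g. {'Apple': 1, 'chip': 1, 'laptops': 1}
```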