From 8fdf152f2734ac2fe0c25cd1b09b1da95c775a3c Mon Sep 17 00:00:00 2001 From: Cirroe Date: Fri, 17 Jan 2025 19:22:46 -0800 Subject: [PATCH] feat: Add multimodal news analysis example with Firecrawl, showcasing advanced web crawling and NLP techniques --- examples/firecrawl_news_analysis/.env.example | 6 ++ examples/firecrawl_news_analysis/README.md | 36 ++++++++ examples/firecrawl_news_analysis/config.py | 36 ++++++++ examples/firecrawl_news_analysis/main.py | 90 +++++++++++++++++++ .../firecrawl_news_analysis/requirements.txt | 15 ++++ .../firecrawl_news_analysis/utils/helpers.py | 75 ++++++++++++++++ 6 files changed, 258 insertions(+) create mode 100644 examples/firecrawl_news_analysis/.env.example create mode 100644 examples/firecrawl_news_analysis/README.md create mode 100644 examples/firecrawl_news_analysis/config.py create mode 100644 examples/firecrawl_news_analysis/main.py create mode 100644 examples/firecrawl_news_analysis/requirements.txt create mode 100644 examples/firecrawl_news_analysis/utils/helpers.py diff --git a/examples/firecrawl_news_analysis/.env.example b/examples/firecrawl_news_analysis/.env.example new file mode 100644 index 0000000..058a868 --- /dev/null +++ b/examples/firecrawl_news_analysis/.env.example @@ -0,0 +1,6 @@ +# Firecrawl API Configuration +FIRECRAWL_API_KEY=your_firecrawl_api_key + +# News API Configuration +NEWSAPI_KEY=your_newsapi_key +NYT_API_KEY=your_new_york_times_api_key \ No newline at end of file diff --git a/examples/firecrawl_news_analysis/README.md b/examples/firecrawl_news_analysis/README.md new file mode 100644 index 0000000..aefca9a --- /dev/null +++ b/examples/firecrawl_news_analysis/README.md @@ -0,0 +1,36 @@ +# Firecrawl News Analysis Example + +## Overview +A comprehensive example demonstrating web scraping, data processing, and analysis of news articles using Firecrawl. 
+ +## Features +- Multi-source news article retrieval +- Intelligent web scraping with Firecrawl +- Sentiment analysis +- Keyword extraction +- Flexible search capabilities + +## Prerequisites +- Python 3.9+ +- Firecrawl API Key + +## Setup +1. Clone the repository +2. Install dependencies: `pip install -r requirements.txt` +3. Create a `.env` file with your API keys: + ``` + FIRECRAWL_API_KEY=your_key_here + NEWSAPI_KEY=your_key_here + ``` +4. Run the application: `python main.py` + +## Usage +```bash +python main.py --query "AI Technology" --from-date 2024-01-01 --to-date 2024-02-01 +``` + +## Contributing +Contributions are welcome! Please read our contributing guidelines. + +## License +MIT License \ No newline at end of file diff --git a/examples/firecrawl_news_analysis/config.py b/examples/firecrawl_news_analysis/config.py new file mode 100644 index 0000000..21a33d8 --- /dev/null +++ b/examples/firecrawl_news_analysis/config.py @@ -0,0 +1,36 @@ +import os +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# API Configuration +API_CONFIG = { + 'firecrawl': { + 'api_key': os.getenv('FIRECRAWL_API_KEY'), + 'base_url': 'https://api.firecrawl.dev/v0' + }, + 'newsapi': { + 'api_key': os.getenv('NEWSAPI_KEY'), + 'base_url': 'https://newsapi.org/v2' + }, + 'nyt': { + 'api_key': os.getenv('NYT_API_KEY'), + 'base_url': 'https://api.nytimes.com/svc/search/v2' + } +} + +# Default search parameters +DEFAULT_SEARCH_PARAMS = { + 'query': 'technology', + 'from_date': '2024-01-01', + 'to_date': '2024-02-01', + 'language': 'en', + 'sources': ['techcrunch', 'wired', 'the-verge'] +} + +# Logging configuration +LOGGING_CONFIG = { + 'level': 'INFO', + 'format': '%(asctime)s - %(levelname)s: %(message)s' +} \ No newline at end of file diff --git a/examples/firecrawl_news_analysis/main.py b/examples/firecrawl_news_analysis/main.py new file mode 100644 index 0000000..fab2e6f --- /dev/null +++ b/examples/firecrawl_news_analysis/main.py @@ -0,0 +1,90 @@ 
import argparse
import logging
import sys

from config import DEFAULT_SEARCH_PARAMS, LOGGING_CONFIG
from utils.helpers import validate_date
# NOTE(review): this patch does not create the api_clients/ or
# data_processing/ packages, so these imports fail and the example cannot
# run as committed -- confirm the intended package layout and add them.
from api_clients.firecrawl_client import FirecrawlNewsClient
from data_processing.analyzer import NewsAnalyzer


def setup_logging():
    """Configure application-wide logging from LOGGING_CONFIG."""
    logging.basicConfig(
        level=LOGGING_CONFIG['level'],
        format=LOGGING_CONFIG['format'],
    )


def parse_arguments():
    """Parse command-line arguments, defaulting to DEFAULT_SEARCH_PARAMS.

    Returns:
        argparse.Namespace with ``query``, ``from_date`` and ``to_date``.
    """
    parser = argparse.ArgumentParser(description='Firecrawl News Analysis Tool')
    parser.add_argument('--query', default=DEFAULT_SEARCH_PARAMS['query'],
                        help='Search query for news articles')
    parser.add_argument('--from-date', default=DEFAULT_SEARCH_PARAMS['from_date'],
                        help='Start date for news search (YYYY-MM-DD)')
    parser.add_argument('--to-date', default=DEFAULT_SEARCH_PARAMS['to_date'],
                        help='End date for news search (YYYY-MM-DD)')
    return parser.parse_args()


def _display_results(results):
    """Pretty-print the analysis summary dict to stdout."""
    print("\n--- News Analysis Results ---")
    print(f"Total Articles: {results['total_articles']}")

    print("\nSentiment Summary:")
    for sentiment, count in results['sentiment_summary'].items():
        print(f"{sentiment.capitalize()}: {count}")

    print("\nTop Keywords:")
    for keyword, freq in results['top_keywords'].items():
        print(f"{keyword}: {freq}")

    print("\nNews Sources:")
    for source, count in results['source_distribution'].items():
        print(f"{source}: {count}")


def main():
    """Fetch news articles for a query/date range and print an analysis.

    Workflow: parse arguments, normalize dates, fetch articles via
    Firecrawl, run sentiment/keyword/source analysis, display results.
    Exits with status 1 on failure; the original logged the error and then
    returned success (exit 0), which hides failures from scripted callers.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    try:
        args = parse_arguments()

        # Normalize dates to YYYY-MM-DD. validate_date is best-effort: it
        # falls back to today's date on unparseable input, never raises.
        from_date = validate_date(args.from_date)
        to_date = validate_date(args.to_date)

        firecrawl_client = FirecrawlNewsClient()

        # Lazy %-style args so the message is only formatted if emitted.
        logger.info("Searching news for query: %s", args.query)
        news_articles = firecrawl_client.search_news(
            query=args.query,
            from_date=from_date,
            to_date=to_date,
        )

        analyzer = NewsAnalyzer(news_articles)
        analysis_results = {
            'total_articles': len(news_articles),
            'sentiment_summary': analyzer.analyze_sentiment(),
            'top_keywords': analyzer.extract_top_keywords(),
            'source_distribution': analyzer.analyze_sources(),
        }

        _display_results(analysis_results)

    except Exception:
        # Top-level boundary: log the full traceback and signal failure
        # to the shell instead of swallowing the error.
        logger.exception("An error occurred during news analysis")
        sys.exit(1)


if __name__ == "__main__":
    main()
import re
from datetime import datetime
from typing import Any, Dict

# Accepted input formats, tried in order; output is always ISO YYYY-MM-DD.
_DATE_FORMATS = ('%Y-%m-%d', '%m/%d/%Y', '%d-%m-%Y')

# Lazily-loaded spaCy pipeline, shared across extract_keywords calls.
_NLP = None


def validate_date(date_str: str) -> str:
    """Validate and standardize a date string.

    Tries each format in _DATE_FORMATS and returns the first successful
    parse re-serialized as YYYY-MM-DD. On failure it prints a warning and
    falls back to today's date -- the original's best-effort contract is
    preserved: this function never raises to the caller. (The original
    raised a ValueError only to catch it itself in an enclosing broad
    ``except Exception``; this version drops that detour.)

    Args:
        date_str (str): Input date string.

    Returns:
        str: Validated date in YYYY-MM-DD format.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt).strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            # TypeError covers non-string input, which the original's
            # outer broad except also swallowed.
            continue

    print(f"Date validation error: Invalid date format: {date_str}")
    return datetime.now().strftime('%Y-%m-%d')


def clean_text(text: str) -> str:
    """Clean and normalize text.

    Strips non-alphanumeric characters, collapses runs of whitespace, and
    lowercases. Falsy input (None or "") yields "".

    Args:
        text (str): Input text to clean.

    Returns:
        str: Cleaned, lowercased text.
    """
    if not text:
        return ""

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()


def _get_nlp():
    """Load the spaCy English pipeline once and reuse it.

    Model loading is expensive; the original reloaded ``en_core_web_sm``
    on every extract_keywords call.
    """
    global _NLP
    if _NLP is None:
        import spacy  # local import: spaCy is an optional heavy dependency
        _NLP = spacy.load('en_core_web_sm')
    return _NLP


def extract_keywords(text: str, top_n: int = 10) -> Dict[str, Any]:
    """Extract the most frequent noun/proper-noun tokens from text.

    Args:
        text (str): Input text.
        top_n (int): Number of top keywords to return.

    Returns:
        Dict: keyword -> frequency, at most top_n entries. Returns {} on
        any failure (e.g. spaCy or its model missing), matching the
        original's best-effort behavior.
    """
    from collections import Counter

    try:
        doc = _get_nlp()(text)
        # Keep only nouns and proper nouns as keyword candidates.
        keywords = [tok.text for tok in doc if tok.pos_ in ('NOUN', 'PROPN')]
        return dict(Counter(keywords).most_common(top_n))
    except Exception as e:
        print(f"Keyword extraction error: {e}")
        return {}