From 8fdf152f2734ac2fe0c25cd1b09b1da95c775a3c Mon Sep 17 00:00:00 2001 From: Cirroe Date: Fri, 17 Jan 2025 19:22:46 -0800 Subject: [PATCH] feat: Add multimodal news analysis example with Firecrawl, showcasing advanced web crawling and NLP techniques --- examples/firecrawl_news_analysis/.env.example | 6 ++ examples/firecrawl_news_analysis/README.md | 36 ++++++++ examples/firecrawl_news_analysis/config.py | 36 ++++++++ examples/firecrawl_news_analysis/main.py | 90 +++++++++++++++++++ .../firecrawl_news_analysis/requirements.txt | 15 ++++ .../firecrawl_news_analysis/utils/helpers.py | 75 ++++++++++++++++ 6 files changed, 258 insertions(+) create mode 100644 examples/firecrawl_news_analysis/.env.example create mode 100644 examples/firecrawl_news_analysis/README.md create mode 100644 examples/firecrawl_news_analysis/config.py create mode 100644 examples/firecrawl_news_analysis/main.py create mode 100644 examples/firecrawl_news_analysis/requirements.txt create mode 100644 examples/firecrawl_news_analysis/utils/helpers.py diff --git a/examples/firecrawl_news_analysis/.env.example b/examples/firecrawl_news_analysis/.env.example new file mode 100644 index 0000000..058a868 --- /dev/null +++ b/examples/firecrawl_news_analysis/.env.example @@ -0,0 +1,6 @@ +# Firecrawl API Configuration +FIRECRAWL_API_KEY=your_firecrawl_api_key + +# News API Configuration +NEWSAPI_KEY=your_newsapi_key +NYT_API_KEY=your_new_york_times_api_key \ No newline at end of file diff --git a/examples/firecrawl_news_analysis/README.md b/examples/firecrawl_news_analysis/README.md new file mode 100644 index 0000000..aefca9a --- /dev/null +++ b/examples/firecrawl_news_analysis/README.md @@ -0,0 +1,36 @@ +# Firecrawl News Analysis Example + +## Overview +A comprehensive example demonstrating web scraping, data processing, and analysis of news articles using Firecrawl. 
+ +## Features +- Multi-source news article retrieval +- Intelligent web scraping with Firecrawl +- Sentiment analysis +- Keyword extraction +- Flexible search capabilities + +## Prerequisites +- Python 3.9+ +- Firecrawl API Key + +## Setup +1. Clone the repository +2. Install dependencies: `pip install -r requirements.txt` +3. Create a `.env` file with your API keys: + ``` + FIRECRAWL_API_KEY=your_key_here + NEWSAPI_KEY=your_key_here + ``` +4. Run the application: `python main.py` + +## Usage +```bash +python main.py --query "AI Technology" --from-date 2024-01-01 --to-date 2024-02-01 +``` + +## Contributing +Contributions are welcome! Please read our contributing guidelines. + +## License +MIT License \ No newline at end of file diff --git a/examples/firecrawl_news_analysis/config.py b/examples/firecrawl_news_analysis/config.py new file mode 100644 index 0000000..21a33d8 --- /dev/null +++ b/examples/firecrawl_news_analysis/config.py @@ -0,0 +1,36 @@ +import os +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# API Configuration +API_CONFIG = { + 'firecrawl': { + 'api_key': os.getenv('FIRECRAWL_API_KEY'), + 'base_url': 'https://api.firecrawl.dev/v0' + }, + 'newsapi': { + 'api_key': os.getenv('NEWSAPI_KEY'), + 'base_url': 'https://newsapi.org/v2' + }, + 'nyt': { + 'api_key': os.getenv('NYT_API_KEY'), + 'base_url': 'https://api.nytimes.com/svc/search/v2' + } +} + +# Default search parameters +DEFAULT_SEARCH_PARAMS = { + 'query': 'technology', + 'from_date': '2024-01-01', + 'to_date': '2024-02-01', + 'language': 'en', + 'sources': ['techcrunch', 'wired', 'the-verge'] +} + +# Logging configuration +LOGGING_CONFIG = { + 'level': 'INFO', + 'format': '%(asctime)s - %(levelname)s: %(message)s' +} \ No newline at end of file diff --git a/examples/firecrawl_news_analysis/main.py b/examples/firecrawl_news_analysis/main.py new file mode 100644 index 0000000..fab2e6f --- /dev/null +++ b/examples/firecrawl_news_analysis/main.py @@ -0,0 +1,90 @@ 
import argparse
import logging
import sys

from config import DEFAULT_SEARCH_PARAMS, LOGGING_CONFIG
from utils.helpers import validate_date
# NOTE(review): this patch does not create the api_clients/ or
# data_processing/ packages, so these imports fail and the example cannot
# run as committed -- confirm the intended package layout and add them.
from api_clients.firecrawl_client import FirecrawlNewsClient
from data_processing.analyzer import NewsAnalyzer


def setup_logging():
    """Configure application-wide logging from LOGGING_CONFIG."""
    logging.basicConfig(
        level=LOGGING_CONFIG['level'],
        format=LOGGING_CONFIG['format'],
    )


def parse_arguments():
    """Parse command-line arguments, defaulting to DEFAULT_SEARCH_PARAMS.

    Returns:
        argparse.Namespace with ``query``, ``from_date`` and ``to_date``.
    """
    parser = argparse.ArgumentParser(description='Firecrawl News Analysis Tool')
    parser.add_argument('--query', default=DEFAULT_SEARCH_PARAMS['query'],
                        help='Search query for news articles')
    parser.add_argument('--from-date', default=DEFAULT_SEARCH_PARAMS['from_date'],
                        help='Start date for news search (YYYY-MM-DD)')
    parser.add_argument('--to-date', default=DEFAULT_SEARCH_PARAMS['to_date'],
                        help='End date for news search (YYYY-MM-DD)')
    return parser.parse_args()


def _display_results(results):
    """Pretty-print the analysis summary dict to stdout."""
    print("\n--- News Analysis Results ---")
    print(f"Total Articles: {results['total_articles']}")

    print("\nSentiment Summary:")
    for sentiment, count in results['sentiment_summary'].items():
        print(f"{sentiment.capitalize()}: {count}")

    print("\nTop Keywords:")
    for keyword, freq in results['top_keywords'].items():
        print(f"{keyword}: {freq}")

    print("\nNews Sources:")
    for source, count in results['source_distribution'].items():
        print(f"{source}: {count}")


def main():
    """Fetch news articles for a query/date range and print an analysis.

    Workflow: parse arguments, normalize dates, fetch articles via
    Firecrawl, run sentiment/keyword/source analysis, display results.
    Exits with status 1 on failure; the original logged the error and then
    returned success (exit 0), which hides failures from scripted callers.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    try:
        args = parse_arguments()

        # Normalize dates to YYYY-MM-DD. validate_date is best-effort: it
        # falls back to today's date on unparseable input, never raises.
        from_date = validate_date(args.from_date)
        to_date = validate_date(args.to_date)

        firecrawl_client = FirecrawlNewsClient()

        # Lazy %-style args so the message is only formatted if emitted.
        logger.info("Searching news for query: %s", args.query)
        news_articles = firecrawl_client.search_news(
            query=args.query,
            from_date=from_date,
            to_date=to_date,
        )

        analyzer = NewsAnalyzer(news_articles)
        analysis_results = {
            'total_articles': len(news_articles),
            'sentiment_summary': analyzer.analyze_sentiment(),
            'top_keywords': analyzer.extract_top_keywords(),
            'source_distribution': analyzer.analyze_sources(),
        }

        _display_results(analysis_results)

    except Exception:
        # Top-level boundary: log the full traceback and signal failure
        # to the shell instead of swallowing the error.
        logger.exception("An error occurred during news analysis")
        sys.exit(1)


if __name__ == "__main__":
    main()
import re
from datetime import datetime
from typing import Any, Dict

# Accepted input formats, tried in order; output is always ISO YYYY-MM-DD.
_DATE_FORMATS = ('%Y-%m-%d', '%m/%d/%Y', '%d-%m-%Y')

# Lazily-loaded spaCy pipeline, shared across extract_keywords calls.
_NLP = None


def validate_date(date_str: str) -> str:
    """Validate and standardize a date string.

    Tries each format in _DATE_FORMATS and returns the first successful
    parse re-serialized as YYYY-MM-DD. On failure it prints a warning and
    falls back to today's date -- the original's best-effort contract is
    preserved: this function never raises to the caller. (The original
    raised a ValueError only to catch it itself in an enclosing broad
    ``except Exception``; this version drops that detour.)

    Args:
        date_str (str): Input date string.

    Returns:
        str: Validated date in YYYY-MM-DD format.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt).strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            # TypeError covers non-string input, which the original's
            # outer broad except also swallowed.
            continue

    print(f"Date validation error: Invalid date format: {date_str}")
    return datetime.now().strftime('%Y-%m-%d')


def clean_text(text: str) -> str:
    """Clean and normalize text.

    Strips non-alphanumeric characters, collapses runs of whitespace, and
    lowercases. Falsy input (None or "") yields "".

    Args:
        text (str): Input text to clean.

    Returns:
        str: Cleaned, lowercased text.
    """
    if not text:
        return ""

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()


def _get_nlp():
    """Load the spaCy English pipeline once and reuse it.

    Model loading is expensive; the original reloaded ``en_core_web_sm``
    on every extract_keywords call.
    """
    global _NLP
    if _NLP is None:
        import spacy  # local import: spaCy is an optional heavy dependency
        _NLP = spacy.load('en_core_web_sm')
    return _NLP


def extract_keywords(text: str, top_n: int = 10) -> Dict[str, Any]:
    """Extract the most frequent noun/proper-noun tokens from text.

    Args:
        text (str): Input text.
        top_n (int): Number of top keywords to return.

    Returns:
        Dict: keyword -> frequency, at most top_n entries. Returns {} on
        any failure (e.g. spaCy or its model missing), matching the
        original's best-effort behavior.
    """
    from collections import Counter

    try:
        doc = _get_nlp()(text)
        # Keep only nouns and proper nouns as keyword candidates.
        keywords = [tok.text for tok in doc if tok.pos_ in ('NOUN', 'PROPN')]
        return dict(Counter(keywords).most_common(top_n))
    except Exception as e:
        print(f"Keyword extraction error: {e}")
        return {}