Skip to content

Commit

Permalink
Merge pull request #1 from er-ri/develop
Browse files Browse the repository at this point in the history
Web scraper for minkabu
  • Loading branch information
er-ri authored Dec 16, 2023
2 parents 5532e03 + 4b847e6 commit 48f9155
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 99 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# fscraper
# FSCRAPER
Financial Data Scraper

## Introduction
The project contains a collection of functions used to scrape financial data from the internet, mainly in Japan, and to calculate financial indicators such as *RSI*, *beta*, *MACD*, etc. Web scraping is implemented using `BeautifulSoup` and `requests` for the site that provides a RESTful API endpoint.
The project contains a collection of functions used to scrape financial data, together with financial indicator calculators such as *RSI*, *beta*, *MACD*, etc. Web scraping is implemented using `BeautifulSoup` and `requests` for sites that provide a RESTful API endpoint.

## Getting Started
### Installation
Expand Down Expand Up @@ -37,6 +37,10 @@ df = ks.get_target_price()
# Kabutan
kbs = fs.KabutanScraper('7203.T')
df = kbs.get_stock_price_by_minutes()

# Minkabu
ms = fs.MinkabuScraper('7203.T')
df = ms.get_analysis()
```

### Indicator
Expand Down
2 changes: 1 addition & 1 deletion fscraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .kabuyohoscraper import KabuyohoScraper
from .kabutanscraper import KabutanScraper
from .reuterscraper import ReutersScraper

from .minkabuscraper import MinkabuScraper

from .utils import (
calculate_pearson_correlation,
Expand Down
5 changes: 1 addition & 4 deletions fscraper/kabutanscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@ def __init__(self, code):
self.code = code.upper().replace('.T', '')

def get_stock_price_by_minutes(self):
r"""
Description:
Get stock price by minute
"""
"""Get stock price by minute"""
url = "https://kabutan.jp/stock/read?c={}&m=4&k=1&{}=".format(self.code, int(time.time() * 1000))
html = requests.get(url=url, headers=scraper_headers).text
csvStringIO = StringIO(html)
Expand Down
29 changes: 16 additions & 13 deletions fscraper/kabuyohoscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,15 @@ def __get_raw_data__(dom, xpath):
raw_data = raw_data.replace(unicode, '')
return raw_data

@classmethod
def __scrape_report_target(cls, url):
r"""
Description:
Scrape the specific url
Return:
A dom Object(etree.ElementTree)
"""Scrape the specific url
Args:
url(str): kabuyoho url
Returns:
etree.ElementTree: dom object
"""
scraper_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
Expand Down Expand Up @@ -106,9 +109,10 @@ def get_report_top(self):
return df.transpose()

def get_report_target(self):
r"""
Description:
Get report from '/sp/reportTarget'.
"""Get report from '/sp/reportTarget'
Returns:
pd.DataFrame: kabuyoho report page info
"""
dom = self.__get_report_target_dom()

Expand Down Expand Up @@ -181,11 +185,10 @@ def get_report_target(self):
return df.transpose()

def get_target_price(self):
r"""
Description:
Get theory PB/R and PE/R market price from sbisec API.("https://img-sec.ifis.co.jp")
The fastest way.
Return: DataFrame
"""Get theory PB/R and PE/R market price from sbisec API.("https://img-sec.ifis.co.jp")
Returns:
pd.DataFrame: target price
"""
# `Request` without the `Referer` parameter will be blocked by the website.
scraper_headers = {
Expand Down
55 changes: 55 additions & 0 deletions fscraper/minkabuscraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import requests
import pandas as pd


class MinkabuScraper:
    """Minkabu scraper for https://minkabu.jp/

    Attributes:
        code (str): ticker symbol with the '.T' suffix removed (e.g. '7203')
    """

    def __init__(self, code: str):
        # Normalize the ticker: uppercase first so '.t' is also stripped,
        # consistent with KabutanScraper's handling of the code.
        self.code = code.upper().replace('.T', '')

    def get_analysis(self):
        """Get Minkabu analysis data from https://minkabu.jp/stock/code/analysis

        Returns:
            pd.DataFrame: daily analysis data indexed by date, including close,
                target price, predicted price, theoretical price, volume, news,
                picks, Nikkei 225 close and USD/JPY close.
        """
        # Browser-like headers; the assets endpoint expects an XHR-style request
        # originating from minkabu.jp.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://minkabu.jp/',
            'ContentType': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://minkabu.jp',
            'Connection': 'keep-alive',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
        }

        url = 'https://assets.minkabu.jp/jsons/stock-jam/stocks/{code}/lump.json'.format(
            code=self.code)

        raw_json = requests.get(url, headers=headers).json()

        # The JSON payload is column-oriented: parallel arrays keyed off 'dates'.
        df = pd.DataFrame()
        df['date'] = pd.to_datetime(raw_json['dates'])
        df['close'] = pd.to_numeric(raw_json['stock']['closes'])
        df['target_price'] = pd.to_numeric(raw_json['stock']['mk_prices'])
        df['predict_price'] = pd.to_numeric(raw_json['stock']['picks_prices'])
        df['theoretical_price'] = pd.to_numeric(
            raw_json['stock']['theoretic_prices'])
        df['volume'] = pd.to_numeric(raw_json['stock']['volumes'])

        df['news'] = raw_json['stock']['news']
        df['picks'] = raw_json['stock']['picks']
        df['n225'] = pd.to_numeric(raw_json['n225']['closes'])
        df['usdjpy'] = pd.to_numeric(raw_json['usdjpy']['closes'])

        df = df.set_index('date')
        return df
44 changes: 22 additions & 22 deletions fscraper/reuterscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


class ReutersScraper(object):
r"""
"""
JSON api interface('https://jp.reuters.com/companies/api/')
"""

Expand Down Expand Up @@ -53,38 +53,38 @@ def get_event(self):

# Data factory
def get_income_statement(self, period='annual'):
    """Retrieve income statement from the Reuters JSON API.

    Args:
        period (str): 'annual' or 'interim' income statement.

    Returns:
        pd.DataFrame: income statement.
    """
    df = self.__extract_financial_statement('income', period)

    return df

def get_balance_sheet(self, period='annual'):
    """Retrieve balance sheet from the Reuters JSON API.

    Args:
        period (str): 'annual' or 'interim' balance sheet.

    Returns:
        pd.DataFrame: balance sheet.
    """
    df = self.__extract_financial_statement('balance_sheet', period)
    return df

def get_cash_flow(self, period='annual'):
    """Retrieve cash flow statement from the Reuters JSON API.

    Args:
        period (str): 'annual' or 'interim' cash flow.

    Returns:
        pd.DataFrame: cash flow statement.
    """
    df = self.__extract_financial_statement('cash_flow', period)
    return df
Expand Down
Loading

0 comments on commit 48f9155

Please sign in to comment.