Skip to content

Commit

Permalink
Merge pull request #1 from er-ri/develop
Browse files Browse the repository at this point in the history
Web scraper for minkabu
  • Loading branch information
er-ri authored Dec 16, 2023
2 parents 5532e03 + 4b847e6 commit 48f9155
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 99 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# fscraper
# FSCRAPER
Financial Data Scraper

## Introduction
The project contains a collection of functions used to scrape financial data from the internet, mainly in Japan, and to calculate financial indicators such as *RSI*, *beta*, *MACD*, etc. Web scraping is implemented using `BeautifulSoup` and `requests` for the site that provides a RESTful API endpoint.
The project contains a collection of functions used to scrape financial data, together with financial indicator calculators such as *RSI*, *beta*, *MACD*, etc. Web scraping is implemented using `BeautifulSoup` and `requests` for sites that provide a RESTful API endpoint.

## Getting Started
### Installation
Expand Down Expand Up @@ -37,6 +37,10 @@ df = ks.get_target_price()
# Kabutan
kbs = fs.KabutanScraper('7203.T')
df = kbs.get_stock_price_by_minutes()

# Minkabu
ms = fs.MinkabuScraper('7203.T')
df = ms.get_analysis()
```

### Indicator
Expand Down
2 changes: 1 addition & 1 deletion fscraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .kabuyohoscraper import KabuyohoScraper
from .kabutanscraper import KabutanScraper
from .reuterscraper import ReutersScraper

from .minkabuscraper import MinkabuScraper

from .utils import (
calculate_pearson_correlation,
Expand Down
5 changes: 1 addition & 4 deletions fscraper/kabutanscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@ def __init__(self, code):
self.code = code.upper().replace('.T', '')

def get_stock_price_by_minutes(self):
r"""
Description:
Get stock price by minute
"""
"""Get stock price by minute"""
url = "https://kabutan.jp/stock/read?c={}&m=4&k=1&{}=".format(self.code, int(time.time() * 1000))
html = requests.get(url=url, headers=scraper_headers).text
csvStringIO = StringIO(html)
Expand Down
29 changes: 16 additions & 13 deletions fscraper/kabuyohoscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,15 @@ def __get_raw_data__(dom, xpath):
raw_data = raw_data.replace(unicode, '')
return raw_data

@classmethod
def __scrape_report_target(cls, url):
r"""
Description:
Scrape the specific url
Return:
A dom Object(etree.ElementTree)
"""Scrape the specific url
Args:
url(str): kabuyoho url
Returns:
etree.ElementTree: dom object
"""
scraper_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
Expand Down Expand Up @@ -106,9 +109,10 @@ def get_report_top(self):
return df.transpose()

def get_report_target(self):
r"""
Description:
Get report from '/sp/reportTarget'.
"""Get report from '/sp/reportTarget'
Returns:
pd.DataFrame: kabuyoho report page info
"""
dom = self.__get_report_target_dom()

Expand Down Expand Up @@ -181,11 +185,10 @@ def get_report_target(self):
return df.transpose()

def get_target_price(self):
r"""
Description:
Get theory PB/R and PE/R market price from sbisec API.("https://img-sec.ifis.co.jp")
The fastest way.
Return: DataFrame
"""Get theory PB/R and PE/R market price from sbisec API.("https://img-sec.ifis.co.jp")
Returns:
pd.DataFrame: target price
"""
# `Request` without the `Referer` parameter will be blocked by the website.
scraper_headers = {
Expand Down
55 changes: 55 additions & 0 deletions fscraper/minkabuscraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import requests
import pandas as pd


class MinkabuScraper:
    """Minkabu scraper for https://minkabu.jp/

    Attributes:
        code (str): ticker symbol with the '.T' suffix removed (e.g. '7203')
    """

    def __init__(self, code: str):
        # Normalize the ticker: uppercase first so '.t' is also stripped,
        # consistent with KabutanScraper's handling of the code.
        self.code = code.upper().replace('.T', '')

    def get_analysis(self):
        """Get Minkabu analysis data from https://minkabu.jp/stock/code/analysis

        Returns:
            pd.DataFrame: daily analysis data indexed by date, including close,
                target price, predicted price, theoretical price, volume, news,
                picks, Nikkei 225 close and USD/JPY close.
        """
        # Browser-like headers; the assets endpoint expects an XHR-style request
        # originating from minkabu.jp.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://minkabu.jp/',
            'ContentType': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://minkabu.jp',
            'Connection': 'keep-alive',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
        }

        url = 'https://assets.minkabu.jp/jsons/stock-jam/stocks/{code}/lump.json'.format(
            code=self.code)

        raw_json = requests.get(url, headers=headers).json()

        # The JSON payload is column-oriented: parallel arrays keyed off 'dates'.
        df = pd.DataFrame()
        df['date'] = pd.to_datetime(raw_json['dates'])
        df['close'] = pd.to_numeric(raw_json['stock']['closes'])
        df['target_price'] = pd.to_numeric(raw_json['stock']['mk_prices'])
        df['predict_price'] = pd.to_numeric(raw_json['stock']['picks_prices'])
        df['theoretical_price'] = pd.to_numeric(
            raw_json['stock']['theoretic_prices'])
        df['volume'] = pd.to_numeric(raw_json['stock']['volumes'])

        df['news'] = raw_json['stock']['news']
        df['picks'] = raw_json['stock']['picks']
        df['n225'] = pd.to_numeric(raw_json['n225']['closes'])
        df['usdjpy'] = pd.to_numeric(raw_json['usdjpy']['closes'])

        df = df.set_index('date')
        return df
44 changes: 22 additions & 22 deletions fscraper/reuterscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


class ReutersScraper(object):
r"""
"""
JSON api interface('https://jp.reuters.com/companies/api/')
"""

Expand Down Expand Up @@ -53,38 +53,38 @@ def get_event(self):

# Data factory
def get_income_statement(self, period='annual'):
    """Retrieve income statement from the Reuters JSON API.

    Args:
        period (str): 'annual' or 'interim' income statement.

    Returns:
        pd.DataFrame: income statement.
    """
    df = self.__extract_financial_statement('income', period)

    return df

def get_balance_sheet(self, period='annual'):
    """Retrieve balance sheet from the Reuters JSON API.

    Args:
        period (str): 'annual' or 'interim' balance sheet.

    Returns:
        pd.DataFrame: balance sheet.
    """
    df = self.__extract_financial_statement('balance_sheet', period)
    return df

def get_cash_flow(self, period='annual'):
    """Retrieve cash flow statement from the Reuters JSON API.

    Args:
        period (str): 'annual' or 'interim' cash flow.

    Returns:
        pd.DataFrame: cash flow statement.
    """
    df = self.__extract_financial_statement('cash_flow', period)
    return df
Expand Down
Loading

0 comments on commit 48f9155

Please sign in to comment.