diff --git a/README.md b/README.md index 9a1bc91..b3ff1ef 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# fscraper +# FSCRAPER Financial Data Scraper ## Introduction -The project contains a collection of functions used to scrape financial data from the internet, mainly in Japan, and to calculate financial indicators such as *RSI*, *beta*, *MACD*, etc. Web scraping is implemented using `BeautifulSoup` and `requests` for the site that provides a RESTful API endpoint. +The project contains a collection of functions used to scrape financial data, together with calculators for financial indicators such as *RSI*, *beta*, *MACD*, etc. Web scraping is implemented using `BeautifulSoup` and `requests` for sites that provide a RESTful API endpoint. ## Getting Started ### Installation @@ -37,6 +37,10 @@ df = ks.get_target_price() # Kabutan kbs = fs.KabutanScraper('7203.T') df = kbs.get_stock_price_by_minutes() + +# Minkabu +ms = fs.MinkabuScraper('7203.T') +df = ms.get_analysis() ``` ### Indicator diff --git a/fscraper/__init__.py b/fscraper/__init__.py index 5957a99..35d975b 100644 --- a/fscraper/__init__.py +++ b/fscraper/__init__.py @@ -2,7 +2,7 @@ from .kabuyohoscraper import KabuyohoScraper from .kabutanscraper import KabutanScraper from .reuterscraper import ReutersScraper - +from .minkabuscraper import MinkabuScraper from .utils import ( calculate_pearson_correlation, diff --git a/fscraper/kabutanscraper.py b/fscraper/kabutanscraper.py index c82bb1c..38f61e0 100644 --- a/fscraper/kabutanscraper.py +++ b/fscraper/kabutanscraper.py @@ -16,10 +16,7 @@ def __init__(self, code): self.code = code.upper().replace('.T', '') def get_stock_price_by_minutes(self): - r""" - Description: - Get stock price by minute - """ + """Get stock price by minute""" url = "https://kabutan.jp/stock/read?c={}&m=4&k=1&{}=".format(self.code, int(time.time() * 1000)) html = requests.get(url=url, headers=scraper_headers).text csvStringIO = StringIO(html) diff --git 
a/fscraper/kabuyohoscraper.py b/fscraper/kabuyohoscraper.py index 40d6aef..f2df30f 100644 --- a/fscraper/kabuyohoscraper.py +++ b/fscraper/kabuyohoscraper.py @@ -42,12 +42,15 @@ def __get_raw_data__(dom, xpath): raw_data = raw_data.replace(unicode, '') return raw_data + @classmethod def __scrape_report_target(cls, url): - r""" - Description: - Scrape the specific url - Return: - A dom Object(etree.ElementTree) + """Scrape the specific url + + Args: + url(str): kabuyoho url + + Returns: + etree.ElementTree: dom object """ scraper_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0', @@ -106,9 +109,10 @@ def get_report_top(self): return df.transpose() def get_report_target(self): - r""" - Description: - Get report from '/sp/reportTarget'. + """Get report from '/sp/reportTarget' + + Returns: + pd.DataFrame: kabuyoho report page info """ dom = self.__get_report_target_dom() @@ -181,11 +185,10 @@ def get_report_target(self): return df.transpose() def get_target_price(self): - r""" - Description: - Get theory PB/R and PE/R market price from sbisec API.("https://img-sec.ifis.co.jp") - The fastest way. - Return: DataFrame + """Get theory PB/R and PE/R market price from sbisec API.("https://img-sec.ifis.co.jp") + + Returns: + pd.DataFrame: target price """ # `Request` without `Referer`` paramter will be blocked by the website. 
scraper_headers = {
diff --git a/fscraper/minkabuscraper.py b/fscraper/minkabuscraper.py
new file mode 100644
index 0000000..9c30071
--- /dev/null
+++ b/fscraper/minkabuscraper.py
@@ -0,0 +1,66 @@
+import requests
+import pandas as pd
+
+
+class MinkabuScraper:
+    """Minkabu Scraper from https://minkabu.jp/
+
+    Attributes:
+        code(str): upper-cased ticker symbol with '.T' removed (e.g. '7203')
+    """
+
+    def __init__(self, code: str):
+        # Upper-case before stripping so that '7203.t' is normalized as well,
+        # consistent with KabutanScraper.
+        self.code = code.upper().replace('.T', '')
+
+    def get_analysis(self, timeout: float = 30):
+        """Get Minkabu analysis data from https://minkabu.jp/stock/{code}/analysis
+
+        Args:
+            timeout(float): seconds to wait for the server before giving up
+
+        Returns:
+            pd.DataFrame: Analysis data indexed by date, including target price, theoretical price and news, etc.
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status
+        """
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0',
+            'Accept': 'application/json, text/plain, */*',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Referer': 'https://minkabu.jp/',
+            'Content-Type': 'application/json',
+            'X-Requested-With': 'XMLHttpRequest',
+            'Origin': 'https://minkabu.jp',
+            'Connection': 'keep-alive',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-site',
+        }
+
+        url = 'https://assets.minkabu.jp/jsons/stock-jam/stocks/{code}/lump.json'.format(
+            code=self.code)
+
+        # Fail fast on a hung connection or an HTTP error instead of blocking
+        # forever or trying to parse an error page as JSON.
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+        raw_json = response.json()
+        stock = raw_json['stock']
+
+        df = pd.DataFrame()
+        df['date'] = pd.to_datetime(raw_json['dates'])
+        df['close'] = pd.to_numeric(stock['closes'])
+        df['target_price'] = pd.to_numeric(stock['mk_prices'])
+        df['predict_price'] = pd.to_numeric(stock['picks_prices'])
+        df['theoretical_price'] = pd.to_numeric(stock['theoretic_prices'])
+        df['volume'] = pd.to_numeric(stock['volumes'])
+
+        df['news'] = stock['news']
+        df['picks'] = stock['picks']
+        df['n225'] = pd.to_numeric(raw_json['n225']['closes'])
+        df['usdjpy'] = pd.to_numeric(raw_json['usdjpy']['closes'])
+
+        return df.set_index('date')
diff --git a/fscraper/reuterscraper.py
b/fscraper/reuterscraper.py index 4153c00..bd8ffde 100644 --- a/fscraper/reuterscraper.py +++ b/fscraper/reuterscraper.py @@ -4,7 +4,7 @@ class ReutersScraper(object): - r""" + """ JSON api interface('https://jp.reuters.com/companies/api/') """ @@ -53,38 +53,38 @@ def get_event(self): # Data factory def get_income_statement(self, period='annual'): - r""" - Description: - Generate income statement - Parameters: - period: str('annual' or 'interim') - Return: - pd.DataFrame + """Retrieve income statement + + Args: + period(str): 'annual' or 'interim' income statement + + Returns: + pd.DataFrame: income statement """ df = self.__extract_financial_statement('income', period) return df def get_balance_sheet(self, period='annual'): - r""" - Description: - Generate balance sheet - Parameters: - period: str(`annual` or `interim`) - Return: - pd.DataFrame + """Retrieve balance sheet + + Args: + period(str): 'annual' or 'interim' balance sheet + + Returns: + pd.DataFrame: balance sheet """ df = self.__extract_financial_statement('balance_sheet', period) return df def get_cash_flow(self, period='annual'): - r""" - Description: - Get cash flow - Parameters: - period: str(`annual` or `interim`) - Return: - pd.DataFrame + """Get cash flow + + Args: + period(str): 'annual' or 'interim' cash flow + + Returns: + pd.DataFrame: cash flow """ df = self.__extract_financial_statement('cash_flow', period) return df diff --git a/fscraper/utils.py b/fscraper/utils.py index a2a298f..2ae2872 100644 --- a/fscraper/utils.py +++ b/fscraper/utils.py @@ -4,12 +4,16 @@ def calculate_pearson_correlation(price1: pd.Series, price2: pd.Series): - r""" - Description: - Calculate the Pearson Correlation with the two given prices. - Return: - corr: float64 - Example: + """Calculate the Pearson Correlation with the two given prices. 
+ + Args: + price1(pd.Series): the first price for calculation + price2(pd.Series): the second price for calculation + + Returns: + float64: correlation + + Usage: `cor = calculate_pearson_correlation(df1['close'], df2['close'])` """ x = price1.to_numpy() @@ -18,15 +22,16 @@ def calculate_pearson_correlation(price1: pd.Series, price2: pd.Series): def calculate_beta(code: str, market: str = '^N225', period: str = '1y'): - r""" - Description: - Calculate the 'beta' with the given ticker code with the specific period using Yahoo Finance API. - Parameters: - code: str (e.g. '7203.T') - period: str (e.g. '1d', '1mo'...) - Return: - beta: float64 - Example: + """Calculate the 'beta' with the given ticker code with the specific period using Yahoo Finance API. + + Args: + code(str): ticker symbol(e.g. '7203.T') + period(str): beta of period (e.g. '1d', '1mo'...) + + Returns: + float64: beta + + Usage: `beta = calculate_beta('6753.T', '1y')` """ stock1 = YahooFinanceScraper(code) @@ -48,17 +53,20 @@ def calculate_beta(code: str, market: str = '^N225', period: str = '1y'): return cov/var -def calculate_rsi(ser: pd.Series, periods: int = 14): - r""" - Description: - Calculate RSI(Relative Strength Index) for the given price. - Return: - rsi: pd.Series +def calculate_rsi(price: pd.Series, periods: int = 14): + """Calculate RSI(Relative Strength Index) for the given price. + Note: * Greater than 80: overbought, less than 20: oversold. 
+ + Args: + price(pd.Series): stock price + + Returns: + pd.Series: rsi for the date """ # Get up&down moves - price_delta = ser.diff(1) + price_delta = price.diff(1) # Extract up&down moves amount up = price_delta.clip(lower=0) @@ -76,15 +84,21 @@ def calculate_rsi(ser: pd.Series, periods: int = 14): def calculate_stochastic_oscillator(high: pd.Series, low: pd.Series, close: pd.Series, k_period: int = 14, d_period: int = 3): - r""" - Description: - Calculate Stochastic Oscillator Index('%K' and '%D') for the given price(Dataframe) - Return: - df: Dataframe(with 2 more columns'%K' and '%D') - Usage: + """Calculate Stochastic Oscillator Index('%K' and '%D') for the given price(Dataframe) + + Note: * 80: overbought, 20: oversold * '%K' crossing below '%D': sell * '%K' crossing above '%D': buy + + Args: + high(pd.Series): stock high price + low(pd.Series): stock low price + k_period(int): fast stochastic indicator + d_period(int): slow stochastic indicator + + Returns: + pd.DataFrame: input dataframe with 2 more columns '%K' and '%D' """ # Maximum value of previous 14 periods k_high = high.rolling(k_period).max() @@ -100,16 +114,21 @@ def calculate_stochastic_oscillator(high: pd.Series, low: pd.Series, close: pd.S def calculate_bollinger_bands(close: pd.Series, smooth_period: int = 20, standard_deviation: int = 2): - r""" - Description: - Calculate Bollinger Band for the given stock price. - Return: - df: Dataframe(with 2 more columns 'top' and 'bottom') + """Calculate Bollinger Band for the given stock price. + Note: * Breakouts provide no clue as to the direction and extent of future price movement. 
* 65% : standard_deviation = 1 * 95% : standard_deviation = 2 * 99% : standard_deviation = 3 + + Args: + close(pd.Series): close price + smooth_period(int): simple moving average(SMA) period + standard_deviation(int): standard deviation over last n period + + Returns: + pd.DataFrame: input dataframe with 2 more columns 'top' and 'bottom' """ sma = close.rolling(smooth_period).mean() std = close.rolling(smooth_period).std() @@ -121,13 +140,23 @@ def calculate_bollinger_bands(close: pd.Series, smooth_period: int = 20, standar def calculate_macd(close: pd.Series, short_periods: int = 12, long_periods: int = 26, signal_periods: int = 9): - r""" - Description: - Calculate MACD(Moving Average Convergence/Divergence) using 'close' price. + """Calculate MACD(Moving Average Convergence/Divergence) using 'close' price. + Note: * MACD Line > Signal Line -> Buy * MACD Line < Signal Line -> Sell * 'macd_histogram' around 0 indicates a change in trend may occur. + + Args: + close(pd.Series): close price + short_periods(int): the short-term exponential moving averages (EMAs) + long_periods(int): the long-term exponential moving averages (EMAs) + signal_periods(int): n-period EMA of the MACD line + + Returns: + pd.Series: macd + pd.Series: macd signal + pd.Series: macd histogram """ # Get the 12-day EMA of the closing price short_ema = close.ewm(span=short_periods, adjust=False, @@ -150,9 +179,17 @@ def calculate_macd(close: pd.Series, short_periods: int = 12, long_periods: int def set_x_days_high_low(high: pd.Series, low: pd.Series, window: int): - r""" - Description: - Set x days high/low price. + """Set x days high/low price. 
+ + Args: + high(pd.Series): high price + low(pd.Series): low price + window(int): window length for high and low price + + Returns: + pd.Series: highest price for the window + pd.Series: lowest price for the window + Usage: `df['3-day-high'], df['3-day-low'] = set_x_days_high_low(df['high'], df['low'], window=3)` """ @@ -160,9 +197,15 @@ def set_x_days_high_low(high: pd.Series, low: pd.Series, window: int): def calculate_obv(close: pd.Series, volume: pd.Series): - r""" - Description: - On Balance Volume (OBV) + """On Balance Volume (OBV) + + Args: + close(pd.Series): close price + volume(pd.Series): day's volume + + Returns: + pd.Series: OBV + Usage: `df['OBV'] = fs.calculate_obv(df['close'], df['volume'])` """ diff --git a/fscraper/yfscraper.py b/fscraper/yfscraper.py index 958cf5b..8cab33f 100644 --- a/fscraper/yfscraper.py +++ b/fscraper/yfscraper.py @@ -56,12 +56,14 @@ def get_statistics(self): return df.transpose() def get_stock_price(self, period: str = '1mo', interval: str = '1d'): - r""" - Description: - Get historical price - Parameters: - period : str(`1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max`) - interval : str(`1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo`) + """Get historical price + + Args: + period(str): `1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max` + interval(str): `1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo` + + Returns: + pd.DataFrame: stock price """ params = dict() params['range'] = period @@ -73,13 +75,15 @@ def get_stock_price(self, period: str = '1mo', interval: str = '1d'): return df def get_stock_price2(self, start: str = '', end: str = date.today().strftime('%Y-%m-%d'), interval: str = '1d'): - r""" - Description: - Get history price with specified date. - Parameters: - start: str(`yyyy-mm-dd`) - end: str(`yyyy-mm-dd`) - interval : str(`1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo`) + """Get history price with specified date. 
+ + Args: + start(str): start date, format `yyyy-mm-dd` + end(str): end date, format `yyyy-mm-dd` + interval(str): `1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo` + + Returns: + pd.DataFrame: stock price """ params = dict() params['period1'] = int(datetime.strptime( @@ -110,7 +114,7 @@ def __construct_price_dataframe(self, params): # Add dividends if exists. try: - for key, item in price_json['chart']['result'][0]['events']['dividends'].items(): + for _, item in price_json['chart']['result'][0]['events']['dividends'].items(): df.loc[df['date'] == item['date'], 'dividends'] = item['amount'] except KeyError as e: diff --git a/setup.py b/setup.py index edf5ff6..f6e3350 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='fscraper', - version='1.0.3', + version='1.0.4', description='Financial Data Web Scraper', long_description=long_description, long_description_content_type='text/markdown',