-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape.py
115 lines (103 loc) · 5.65 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# 概要
# 歌詞検索サービス[Uta-Net](https://www.uta-net.com/)から各アーティストの歌詞をスクレイピングするコードです。
# ライブラリのインポート
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
# Output directory for the generated CSV files.
PATH = './data'
# Create the directory up front: the CSV writes happen only after every page
# has been downloaded, so a missing folder would otherwise be discovered at
# the very end of the run and discard all of the scraped data.
os.makedirs(PATH, exist_ok=True)
# Links to each artist's lyric-list page(s)
# - Maintained by hand in the form {'artist name': ['URL1', 'URL2', ...]}.
# - An artist whose lyric list spans two or more pages gets one URL per page.
url_dict = {
    'YOASOBI': ['https://www.uta-net.com/artist/28370/'],
    'スピッツ': [
        'https://www.uta-net.com/artist/1475/0/1/',
        'https://www.uta-net.com/artist/1475/0/2/',
    ],
    'Mr.Children': [
        'https://www.uta-net.com/artist/684/0/1/',
        'https://www.uta-net.com/artist/684/0/2/',
    ],
    '米津玄師': ['https://www.uta-net.com/artist/12795/'],
    'SEKAI NO OWARI': ['https://www.uta-net.com/artist/9699/'],
    'あいみょん': ['https://www.uta-net.com/artist/17598/'],
    'King Gnu': ['https://www.uta-net.com/artist/23343/'],
    'Mrs. GREEN APPLE': ['https://www.uta-net.com/artist/18526/'],
    'Official髭男dism': ['https://www.uta-net.com/artist/18093/'],
    '安室奈美恵': ['https://www.uta-net.com/artist/1822/'],
    'back number': ['https://www.uta-net.com/artist/8613/'],
    'ヨルシカ': ['https://www.uta-net.com/artist/22653/'],
    'BUMP OF CHICKEN': ['https://www.uta-net.com/artist/126/'],
    'ONE OK ROCK': ['https://www.uta-net.com/artist/7063/'],
    'RADWIMPS': ['https://www.uta-net.com/artist/4082/'],
    "B'z": [
        'https://www.uta-net.com/artist/134/0/1/',
        'https://www.uta-net.com/artist/134/0/2/',
    ],
    'ゆず': [
        'https://www.uta-net.com/artist/1750/0/1/',
        'https://www.uta-net.com/artist/1750/0/2/',
    ],
    '嵐': [
        'https://www.uta-net.com/artist/3891/0/1/',
        'https://www.uta-net.com/artist/3891/0/2/',
    ],
    'GreeeeN': ['https://www.uta-net.com/artist/5384/'],
    'サザンオールスターズ': [
        'https://www.uta-net.com/artist/1395/0/1/',
        'https://www.uta-net.com/artist/1395/0/2/',
    ],
    '宇多田ヒカル': ['https://www.uta-net.com/artist/1892/'],
    '星野源': ['https://www.uta-net.com/artist/9867/'],
    'ポルノグラフィティ': [
        'https://www.uta-net.com/artist/1686/0/1/',
        'https://www.uta-net.com/artist/1686/0/2/',
    ],
    'Eve': ['https://www.uta-net.com/artist/20987/'],
    'Ado': ['https://www.uta-net.com/artist/29298/'],
    'BTS': ['https://www.uta-net.com/artist/16377/'],
    '中島みゆき': [
        'https://www.uta-net.com/artist/3315/0/1/',
        'https://www.uta-net.com/artist/3315/0/2/',
        'https://www.uta-net.com/artist/3315/0/3/',
    ],
    '緑黄色社会': ['https://www.uta-net.com/artist/22823/'],
    '倖田來未': [
        'https://www.uta-net.com/artist/2261/0/1/',
        'https://www.uta-net.com/artist/2261/0/2/',
    ],
    '優里': ['https://www.uta-net.com/artist/28773/'],
    'いきものがかり': ['https://www.uta-net.com/artist/5580/'],
    'ZARD': ['https://www.uta-net.com/artist/1155/'],
    '椎名林檎': ['https://www.uta-net.com/artist/3361/'],
    '小田和正': ['https://www.uta-net.com/artist/2673/'],
    'TWICE': ['https://www.uta-net.com/artist/21906/'],
    'Perfume': ['https://www.uta-net.com/artist/5555/'],
    'Uru': ['https://www.uta-net.com/artist/20238/'],
    'Superfly': ['https://www.uta-net.com/artist/6895/'],
    'aiko': ['https://www.uta-net.com/artist/39/'],
    'Aimer': ['https://www.uta-net.com/artist/11629/'],
    '三代目 J SOUL BROTHERS': ['https://www.uta-net.com/artist/10539/'],
}
# Fetch the listing pages
# - Download the HTML behind every link registered in `url_dict` and keep the
#   parsed documents for the extraction step.

# Seconds to wait on a single request before giving up; without an explicit
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Parsed "<artist> lyric list" pages: {artist name: [BeautifulSoup, ...]},
# one document per URL, in the order the URLs are listed.
html_dict = dict()
for artist, urls in tqdm(url_dict.items()):
    html_dict[artist] = []
    for url in urls:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx responses: parsing an error page would
        # otherwise yield zero songs for this artist without any warning.
        res.raise_for_status()
        html_dict[artist].append(BeautifulSoup(res.content, 'html.parser'))
# Extract the lyrics
## <Text processing>
## - Full-width spaces (\u3000; per the original author they render as line
##   breaks on the web page) are replaced with "*".
## - After that, every "****" is replaced with "**" (single pass).
artist_arr = []
title_arr = []
lyric_arr = []
for artist, htmls in html_dict.items():
    for html in htmls:
        for table in html.find_all('tbody', class_='songlist-table-body'):
            for tr in table.find_all('tr'):
                title_tag = tr.find('span', class_='fw-bold songlist-title pb-1 pb-lg-0')
                lyric_tag = tr.find('span', class_='d-block pc-utaidashi')
                # find() returns None when the span is absent; skip such rows
                # instead of aborting the whole run with an AttributeError.
                if title_tag is None or lyric_tag is None:
                    continue
                title = title_tag.text
                lyric = lyric_tag.text.replace('\u3000', '*').replace('****', '**')
                artist_arr.append(artist)
                title_arr.append(title)
                lyric_arr.append(lyric)
# The three sequences are index-aligned: entry i describes one song.
artist_arr = np.array(artist_arr)
title_arr = np.array(title_arr)
lyric_arr = np.array(lyric_arr)
# Build the CSV files and save them
## lyric_all.csv   : one row = the full lyric text of one song
## lyric_block.csv : one row = one lyric block (lyrics split on "**")
all_csv_path = os.path.join(PATH, 'lyric_all.csv')
df_all = pd.DataFrame(
    {
        'artist': artist_arr,
        'title': title_arr,
        'lyric': lyric_arr,
    }
)
df_all.to_csv(all_csv_path, index=False)
print('Saved: lyric_all.csv', df_all.shape)
# Expand every song into one (artist, title, block) row per "**"-separated
# block of its lyric text, keeping the original song order.
block_rows = [
    (artist, title, block)
    for artist, title, lyric in zip(artist_arr, title_arr, lyric_arr)
    for block in lyric.split('**')
]
artist_block_arr = [row[0] for row in block_rows]
title_block_arr = [row[1] for row in block_rows]
block_arr = [row[2] for row in block_rows]

df_block = pd.DataFrame(
    {
        'artist': artist_block_arr,
        'title': title_block_arr,
        'block': block_arr,
    }
)
block_csv_path = os.path.join(PATH, 'lyric_block.csv')
df_block.to_csv(block_csv_path, index=False)
print('Saved: lyric_block.csv', df_block.shape)