forked from unicorn-io/Review-Bay
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
39 lines (24 loc) · 837 Bytes
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from bs4 import BeautifulSoup as bs
import urllib.request
import re
# Page whose <label> and <font> elements are scraped.
TARGET_URL = 'https://fms.nrsc.gov.in/fmsexternal/newuserh.php'
# File the matching labels are written to.
OUTPUT_PATH = 'content.json'
# Matches any CSS class name that starts with "pwd".
CLASS_PATTERN = re.compile(r'^pwd')


def fetch_page(url, timeout=30):
    """Return the raw response body served at *url*.

    Args:
        url: Address to request.
        timeout: Seconds to wait before giving up, so a dead server cannot
            hang the script forever.

    Raises:
        OSError: If the request fails (urllib's URLError/HTTPError and
            socket timeouts are all OSError subclasses).
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read()


def find_by_class(soup, tag_name, pattern=CLASS_PATTERN):
    """Return every *tag_name* element in *soup* whose class matches *pattern*."""
    return soup.find_all(tag_name, attrs={'class': pattern})


def nested_fonts(font_tags):
    """Return the <font> elements nested inside each tag of *font_tags*.

    find_all() is a method of a single tag, not of the list that an earlier
    find_all() returned, so it has to be called once per tag.
    """
    return [inner for outer in font_tags for inner in outer.find_all('font')]


def write_lines(items, path):
    """Write ``str(item)`` for every item to *path*, one item per line.

    The output is plain line-oriented text, not strict JSON, even when
    *path* ends in ".json".
    """
    # Tags are not strings, so convert explicitly before appending "\n".
    # UTF-8 keeps non-ASCII markup from failing on a narrow locale encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(str(item) + "\n" for item in items)


def main():
    """Fetch the target page, print the matching tags and save the labels.

    Returns:
        0 on success, 1 if the page could not be downloaded.
    """
    # Connect to the website; without a page there is nothing to parse.
    try:
        page = fetch_page(TARGET_URL)
    except OSError as e:
        print(e)
        return 1

    # TODO: add functionality for php also; currently working for html only.
    soup = bs(page, 'html.parser')
    print(soup)

    # Labels whose class starts with "pwd".
    content_lbl = find_by_class(soup, 'label')
    print(content_lbl)

    # <font> elements nested inside the matching <font> elements.
    font_part = find_by_class(soup, 'font')
    font_part_soup = nested_fonts(font_part)
    print(font_part_soup)

    # NOTE(review): the original loop iterated an undefined name `content`;
    # presumably the labels found above were meant -- confirm.
    write_lines(content_lbl, OUTPUT_PATH)
    return 0


if __name__ == '__main__':
    raise SystemExit(main())