forked from sadicelik/DrugBank-Web-Scraper
Commit
8e6220a · 0 parents · Sadi Çelik committed on Nov 6, 2021
Showing 133 changed files with 32,932 additions and 0 deletions.
@@ -0,0 +1,14 @@
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch()
    page = browser.new_page()

    # Fetch every page of the pharmaco-metabolomics listing and save the rendered HTML for offline parsing.
    for page_number in range(1, 125):
        page.goto("https://go.drugbank.com/pharmaco/metabolomics?page={}".format(page_number))
        page.wait_for_timeout(10000)  # give the page time to finish loading
        # Change the file location to match your machine
        with open(r"pharmaco-metabolomics-data\pharmaco-metabolomics-page{}_markup.html".format(page_number), "w", encoding="utf-8") as file:
            file.write(page.content())

    browser.close()
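
Because DrugBank sits behind Cloudflare, a saved page can end up being the interstitial challenge rather than the listing itself. A minimal, hypothetical check on the files written above (reusing the table class that the parser below looks for) might look like this:

from bs4 import BeautifulSoup

# Hypothetical sanity check: flag saved pages that do not contain the metabolite table.
for page_number in range(1, 125):
    path = r"pharmaco-metabolomics-data\pharmaco-metabolomics-page{}_markup.html".format(page_number)
    with open(path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")
    if soup.find("table", {"class": "table-metabolite-regulations"}) is None:
        print("Page {} looks incomplete (no metabolite table found)".format(page_number))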
@@ -0,0 +1,45 @@
import pandas as pd
from bs4 import BeautifulSoup


def extract_text(soup_obj, tag, attribute_name, attribute_value):
    # Helper: return the stripped text of the first matching tag, or '' if it is missing.
    txt = soup_obj.find(tag, {attribute_name: attribute_value}).text.strip() if soup_obj.find(tag, {attribute_name: attribute_value}) else ''
    return txt


rows = []

for page_number in range(1, 125):
    # Change the file location to match your machine
    with open(r"pharmaco-metabolomics-data\pharmaco-metabolomics-page{}_markup.html".format(page_number), "r", encoding="utf-8") as file:
        data = file.read()
    soup = BeautifulSoup(data, 'html.parser')
    drug_metabolite_table = soup.find('table', {'class': 'table-metabolite-regulations table table-bordered'})
    drug_metabolite_table_body = soup.find("tbody")
    drug_metabolite_table_body_row = drug_metabolite_table_body.find_all("tr")

    for row in drug_metabolite_table_body_row:
        temp_data = row.find_all("td")
        drug = temp_data[0].text
        metabolite = temp_data[2].text
        change = temp_data[3].text.strip()
        # Encode the change direction as two indicator columns.
        if change == "increased":
            increased = 1
            decreased = 0
        elif change == "decreased":
            decreased = 1
            increased = 0
        else:
            decreased = 0
            increased = 0
        description = temp_data[4].text

        rows.append([drug, metabolite, change, increased, decreased, description])


# Create dataframe and export
columns = ["Drug", "Metabolite", "Change", "Increased", "Decreased", "Description"]

df = pd.DataFrame(data=rows, columns=columns)
df.sort_values(by="Drug", ascending=True, inplace=True)
df.to_csv('pharmaco-metabolomics-data.csv', index=False)
df.to_excel('pharmaco-metabolomics-data.xlsx', index=False)
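
A quick way to check the export, assuming the CSV written by the script above, is to reload it and count how many rows report each kind of change:

import pandas as pd

# Hypothetical sanity check on the exported dataset.
df = pd.read_csv("pharmaco-metabolomics-data.csv")
print(df.shape)                     # (number of drug-metabolite rows, 6 columns)
print(df["Change"].value_counts())  # how many increased / decreased / other entries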
@@ -0,0 +1,7 @@
import cfscrape

url = "https://go.drugbank.com/pharmaco/metabolomics"
scraper = cfscrape.create_scraper(delay=15)  # returns a CloudflareScraper instance
# Or: scraper = cfscrape.CloudflareScraper()  # CloudflareScraper inherits from requests.Session

print(scraper.get(url).content)  # => "<!DOCTYPE html><html><head>..."
@@ -0,0 +1,8 @@
import undetected_chromedriver as uc

driver = uc.Chrome()
with driver:
    driver.get('https://go.drugbank.com/pharmaco/metabolomics')  # known URL using Cloudflare's "under attack mode"

    # Save the page source before the context manager shuts the browser down.
    with open("page_markup_bypass_3.html", "w", encoding="utf-8") as file:
        file.write(driver.page_source)
@@ -0,0 +1,9 @@
import cloudscraper

url = "https://go.drugbank.com/pharmaco/metabolomics"

scraper = cloudscraper.create_scraper(browser='chrome', debug=True)
# Or: scraper = cloudscraper.CloudScraper()  # CloudScraper inherits from requests.Session

print(scraper.get(url).text)
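
If the cloudscraper response does come back with the real listing, it can be saved for the offline parser the same way the browser-based attempts do. This is only an illustrative sketch; the filename and the 200-status check are assumptions, not part of the original scripts:

import cloudscraper

# Hypothetical follow-up: persist the fetched markup so it can be parsed offline.
url = "https://go.drugbank.com/pharmaco/metabolomics"
scraper = cloudscraper.create_scraper(browser='chrome')
response = scraper.get(url)
if response.status_code == 200:
    with open("page_markup_cloudscraper.html", "w", encoding="utf-8") as file:  # illustrative filename
        file.write(response.text)
else:
    print("Request blocked or failed: HTTP {}".format(response.status_code))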
@@ -0,0 +1,42 @@
# DrugBank-Web Scraper

import time
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Initialize the driver
chrome_driver_path = r"D:\Development\Python\Web Scarping\WebDriver\chromedriver.exe"  # Change to your local driver location

# Options for Cloudflare bypass: hide the usual automation fingerprints
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")

driver = webdriver.Chrome(options=options, executable_path=chrome_driver_path)

# Delay time (seconds) to let the page, and any Cloudflare check, finish loading
delay = 120

driver.get("https://go.drugbank.com/pharmaco/metabolomics")
time.sleep(delay)

with open("page_markup.html", "w", encoding="utf-8") as file:
    file.write(driver.page_source)

# try:
#     WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'card-body')))
# except TimeoutException:
#     print('Loading exceeds delay time')
#     # break
# else:
#     with open("page_markup.html", "w", encoding="utf-8") as file:
#         file.write(driver.page_source)
# finally:
#     driver.quit()
@@ -0,0 +1,11 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd

page = requests.get("https://go.drugbank.com/pharmaco/metabolomics")
soup = BeautifulSoup(page.content, 'html.parser')

print(page)  # prints the response object; the status code shows whether Cloudflare let the plain request through

# with open("drug_demo.html", "w", encoding="utf-8") as file:
#     file.write(str(soup))
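
To make the outcome explicit, one can inspect the status code and the start of the returned body. This is only a diagnostic sketch; the assumption is that Cloudflare answers blocked requests with a challenge page rather than the listing:

import requests

# Diagnostic sketch: show what the plain request actually got back.
page = requests.get("https://go.drugbank.com/pharmaco/metabolomics")
print(page.status_code)   # e.g. 403 if the request was challenged
print(page.text[:200])    # first characters of the returned HTML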