-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsvoutput.py
113 lines (93 loc) · 4.23 KB
/
csvoutput.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os
from datetime import datetime
# Module-level accumulator of scraped rows. process_element() appends one
# (pub_date, company, email, timestamp) tuple per offer it manages to read,
# and the rows are turned into a DataFrame once scraping has finished.
data = []
def process_element(x):
    """Open the x-th offer on the current listing page and record its contact data.

    Clicks the offer link at position ``x``, clicks the e-mail spoiler on the
    offer page, reads the e-mail, company name and publication date, and
    appends a ``(pub_date, company, email, scraped_at)`` tuple to the
    module-level ``data`` list. Afterwards the browser is navigated back.

    Every step is best-effort: when a step fails the offer is skipped, and
    once the offer page has been opened the browser is sent back before
    returning. Relies on the module-level ``driver`` and ``wait`` objects.

    Args:
        x: 1-based position of the offer card, substituted into the listing
            XPath.

    Returns:
        None, on every path.
    """
    offer_xpath = (
        f'/html/body/section[2]/div/div/div[1]/div[{x}]'
        '/div/div/div[2]/div[1]/div[2]/a'
    )
    email_xpath = '//span[@class="spoiler-email font-bold underline cursor-pointer"]'
    company_xpath = '//h2[@class="text-lg font-bold text-gray-800"]'
    pub_date_xpath = '//span[@class="text-sm text-gray-600"]'
    go_back_xpath = '/html/body/div[1]/section/div/main/div/div[1]/a'

    # Step 1: open the offer page.
    try:
        oferta_link = wait.until(EC.element_to_be_clickable((By.XPATH, offer_xpath)))
        oferta_link.click()
        driver.refresh()
    except Exception:
        # Offer link missing or not openable: skip this position.
        return

    # Step 2: click the e-mail spoiler.
    try:
        email_click = wait.until(EC.element_to_be_clickable((By.XPATH, email_xpath)))
        email_click.click()
    except Exception:
        # No e-mail on this offer: leave the offer page and skip it.
        driver.back()
        return

    # Step 3: read the e-mail, company name and publication date.
    try:
        mail = wait.until(EC.visibility_of_element_located((By.XPATH, email_xpath)))
        company = wait.until(EC.visibility_of_element_located((By.XPATH, company_xpath)))
        pub_date = wait.until(EC.visibility_of_element_located((By.XPATH, pub_date_xpath)))
        company_name = company.text
        mail_text = mail.text
        pub_date_text = pub_date.text
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(pub_date_text, company_name, mail_text, current_time)
        data.append((pub_date_text, company_name, mail_text, current_time))
    except Exception:
        driver.back()
        return

    # Step 4: browser-level "back", then reload the page we land on.
    try:
        driver.back()
        driver.refresh()
    except Exception:
        print('go_back error')
        return

    # Step 5: additionally click the on-page back link when it is visible.
    # Ordinary failures (e.g. the link is absent) are ignored on purpose, but
    # KeyboardInterrupt/SystemExit must propagate, so no bare ``except:``.
    try:
        go_back = wait.until(EC.visibility_of_element_located((By.XPATH, go_back_xpath)))
        go_back.click()
    except Exception:
        pass
# Path to the ChromeDriver executable.
chrome_driver_path = 'chromedriver.exe'
# Path to the Adblock Plus CRX extension file.
adblock_path = 'Adblock Plus CRX.crx'

# Chrome options: load the ad-blocker extension and pass the extra flag.
chrome_options = Options()
chrome_options.add_extension(adblock_path)
chrome_options.add_argument("--disable-javascript")

# Start Chrome through a Service bound to the ChromeDriver executable.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# The URL of the listing to scrape.
website = 'https://empregomais.com/'

try:
    driver.get(website)
    # Explicit-wait timeout of 1 second per lookup. process_element() probes
    # 24 positions per page, and every missing element costs a full timeout.
    wait = WebDriverWait(driver, 1)

    # Visit offers 1..24 on each listing page, then advance to the next page
    # until the "next" link can no longer be clicked.
    while True:
        for x in range(1, 25):
            process_element(x)
        try:
            next_page = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="next page-numbers"]')))
            next_page.click()
        except Exception as e:
            print(f"An error occurred while clicking the next page button: {e}")
            break  # Exit the loop if there is an issue clicking the next page button
finally:
    # Always shut the browser down, even on Ctrl-C or an unexpected error,
    # so no Chrome/chromedriver process is left running.
    driver.quit()
# Turn the scraped rows into a DataFrame with the CSV's column layout.
df_new = pd.DataFrame(data, columns=['Pub_date', 'Company', 'Email', 'timestamp'])

# When a previous export exists, put its rows in front of the new ones so
# that de-duplication keeps the earlier record for a repeated e-mail.
if os.path.exists('output.csv'):
    df_existing = pd.read_csv('output.csv')
    merged_rows = pd.concat([df_existing, df_new])
else:
    merged_rows = df_new

# One row per e-mail address, newest scrape timestamp first.
df_combined = merged_rows.drop_duplicates(subset=['Email'])
df_combined = df_combined.sort_values(by='timestamp', ascending=False)

# Write the merged result back to the same CSV file, without the index.
df_combined.to_csv('output.csv', index=False)
print("DataFrame has been appended to 'output.csv'")