-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGoogle_Alert_Email_Digest.py
More file actions
151 lines (131 loc) · 5.95 KB
/
Google_Alert_Email_Digest.py
File metadata and controls
151 lines (131 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pandas as pd
import imaplib
import email
import email.utils
from bs4 import BeautifulSoup
import re
from datetime import datetime
# German month names as they may appear in the alert mails, keyed WITHOUT the
# trailing abbreviation dot so that "Jan." and "Jan" resolve to the same entry.
_MONTH_NUMBERS = {
    "Jan": "01",
    "Feb": "02",
    "März": "03",
    "Apr": "04",
    "Mai": "05",
    "Juni": "06",
    "Juli": "07",
    "Aug": "08",
    "Sep": "09",
    "Sept": "09",  # alternative abbreviation; harmless alias
    "Okt": "10",
    "Nov": "11",
    "Dez": "12",
}

# Inline-style patterns used to locate the individual fields inside one link.
_TITLE_STYLE = re.compile(r"font-size:.*?;")
_COMPANY_STYLE = re.compile(r"color: black;")
_GREY_STYLE = re.compile(r"color: #8A8A8A;")

# "<day>. <month>" with an optional dot after the month ("12. Jan.", "5. Mai").
# The month token is letters only so that numbers such as "3. 5" do not match.
_POSTING_DATE = re.compile(r"(\d+)\.\s+([^\W\d_]+)\.?")


def _city_from_location(location_text):
    """Return the city from "Postal Code, City, Country" or "City, Country", else None."""
    parts = location_text.split(",")
    if len(parts) == 3:
        return parts[1].strip()
    if len(parts) == 2:
        return parts[0].strip()
    return None


def _posting_date(raw_date, reference):
    """Convert a "<day>. <month>" label to "DD.MM.YYYY" relative to *reference*.

    Returns None when *raw_date* does not start with a day/month label and
    "keine Angabe" when the month name is not recognised.
    """
    match = _POSTING_DATE.match(raw_date)
    if match is None:
        return None
    day, month_name = match.groups()
    month = _MONTH_NUMBERS.get(month_name)
    if month is None:
        return "keine Angabe"
    year = reference.year
    # The label carries no year: a month later than the reference month can
    # only belong to the previous year.
    if int(month) > reference.month:
        year -= 1
    return f"{day.zfill(2)}.{month}.{year}"


def extract_detailed_job_details(email_content, received_date):
    """Extract job postings from the HTML body of an alert email.

    Every ``<a href=...>`` element is treated as a candidate posting; links
    whose text contains "+ 1 Filter" or "jobs" are skipped.

    Args:
        email_content: HTML source of the email body.
        received_date: When the email was received. If it is a ``datetime`` it
            is used as the reference for inferring the posting year; otherwise
            the current time is used.

    Returns:
        A list of tuples ``(title, company, link, date, location, job_type,
        received_date)``. Fields that could not be found are ``None``; ``date``
        is "keine Angabe" when a date label was found but its month is unknown.
        Links that yield nothing besides a title and URL are omitted.
    """
    reference = received_date if isinstance(received_date, datetime) else datetime.now()
    soup = BeautifulSoup(email_content, 'html.parser')
    jobs_details = []

    for elem in soup.find_all('a', href=True):
        link_text = elem.text
        if "+ 1 Filter" in link_text or "jobs" in link_text:
            continue

        job_link = elem["href"]

        title_elem = elem.find('span', style=_TITLE_STYLE)
        title = title_elem.text if title_elem else None

        company_elem = elem.find('div', style=_COMPANY_STYLE)
        company = company_elem.text if company_elem else None

        location_elem = elem.find('div', style=_GREY_STYLE)
        location = _city_from_location(location_elem.text) if location_elem else None

        # The grey spans hold the posting date first and the job type second.
        date_type_elems = elem.find_all('span', style=_GREY_STYLE)
        date = _posting_date(date_type_elems[0].text, reference) if date_type_elems else None
        job_type = date_type_elems[1].text if len(date_type_elems) > 1 else None

        # Only keep links that carry more than just a title and URL.
        if company or date or location or job_type:
            jobs_details.append((title, company, job_link, date, location, job_type, received_date))

    return jobs_details
def connect_to_gmail(user='XXX@gmail.com', password='XXX', host='imap.gmail.com'):
    """Open an IMAP-over-SSL connection and log in.

    Args:
        user: Account name to log in with. The default is a placeholder; pass
            the real address instead of editing it into the source.
        password: Password for *user* (placeholder default, see above).
        host: IMAP server host name.

    Returns:
        The logged-in ``imaplib.IMAP4_SSL`` connection. The caller is
        responsible for calling ``logout()`` on it.

    Raises:
        Whatever ``imaplib`` raises when connecting or logging in fails; the
        connection is closed before the exception propagates.
    """
    print("Connecting to Gmail...")
    mail = imaplib.IMAP4_SSL(host)
    try:
        mail.login(user, password)
    except Exception:
        # Do not leave the socket open when authentication fails.
        mail.shutdown()
        raise
    print("Connected to Gmail.")
    return mail
def fetch_emails(mail, sender="notify-noreply@google.com"):
    """Download every inbox message sent by *sender*.

    Args:
        mail: A logged-in IMAP connection offering ``select``, ``search`` and
            ``fetch`` as ``imaplib.IMAP4`` does.
        sender: Address used in the IMAP ``FROM`` search criterion.

    Returns:
        A list of ``email.message.Message`` objects, one per message that
        could be fetched. Messages whose fetch fails are skipped.
    """
    mail.select('inbox')
    print(f"Fetching emails from {sender}...")
    status, id_data = mail.search(None, f'(FROM "{sender}")')

    email_data = []
    # A search without hits may come back as [b''] or [None]; a failed search
    # has a non-OK status. Either way there is nothing to fetch.
    if status == 'OK' and id_data and id_data[0] is not None:
        for email_id in id_data[0].split():
            status, parts = mail.fetch(email_id, '(RFC822)')
            # The message bytes are the second item of the (envelope, data)
            # tuple; other response items are plain bytes or None.
            raw_email = next(
                (part[1] for part in parts or () if isinstance(part, tuple) and len(part) > 1),
                None,
            )
            if status != 'OK' or raw_email is None:
                print(f"Skipping message {email_id!r}: fetch returned {status}.")
                continue
            email_data.append(email.message_from_bytes(raw_email))

    print(f"Fetched {len(email_data)} emails.")
    return email_data
def _decode_part(part):
    """Decode a message part's payload to text using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return None
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='replace')
    except LookupError:
        # The header names a charset Python does not know; fall back to UTF-8.
        return payload.decode('utf-8', errors='replace')


def _select_body(msg):
    """Return the body text to parse, or None if the message has none.

    For multipart messages this is the first non-attachment ``text/html``
    part; for single-part messages it is the payload itself.
    """
    if not msg.is_multipart():
        return _decode_part(msg)
    for part in msg.walk():
        content_disposition = str(part.get("Content-Disposition"))
        if part.get_content_type() == "text/html" and "attachment" not in content_disposition:
            return _decode_part(part)
    return None


def main():
    """Fetch the alert emails, extract their job postings and write them to Excel.

    The result is written to ``extracted_jobs.xlsx`` with one row per unique
    (Title, Location, Company) combination, keeping the earliest received one.
    """
    from datetime import timezone

    mail = connect_to_gmail()
    try:
        emails = fetch_emails(mail)
    finally:
        # Always end the IMAP session, even if fetching fails.
        mail.logout()

    all_jobs = []
    for msg in emails:
        try:
            received_date = email.utils.parsedate_to_datetime(msg['Date'])
        except (TypeError, ValueError):
            print("Skipping email with missing or unparseable Date header.")
            continue
        # parsedate_to_datetime returns a naive datetime for a "-0000" zone;
        # make it aware so it can be ordered against the other emails.
        if received_date.tzinfo is None:
            received_date = received_date.replace(tzinfo=timezone.utc)

        email_content = _select_body(msg)
        if email_content is None:
            continue
        jobs = extract_detailed_job_details(email_content, received_date)
        all_jobs.extend(jobs)
        print(f"Extracted {len(jobs)} jobs from email.")

    df = pd.DataFrame(all_jobs, columns=['Title', 'Company', 'Link', 'Date', 'Location', 'Job Type', 'Received Date'])
    df.sort_values(by='Received Date', inplace=True)
    df.drop_duplicates(subset=['Title', 'Location', 'Company'], keep='first', inplace=True)

    # Format the 'Received Date' column as "DD.MM.YYYY". Formatting element by
    # element also works when the column is object dtype (emails with
    # different UTC offsets, or no rows at all), where `.dt` would raise.
    df['Received Date'] = [received.strftime('%d.%m.%Y') for received in df['Received Date']]

    # Fill empty Date fields with "keine Angabe". Assign the result back:
    # an in-place fillna on a selected column is chained assignment and may
    # not update the frame.
    df['Date'] = df['Date'].fillna("keine Angabe")

    df.to_excel("extracted_jobs.xlsx", index=False)
    print(f"Saved {len(df)} unique jobs to extracted_jobs.xlsx.")
# Script entry point: run the digest only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()