Locally it runs fine and I don't get these issues, so I'm stuck. Any advice you guys have would be a great help. TIA
print("Welcome!")
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
#choice = int(input('custom job title? 0/1'))
#if choice==1:
# keywords = input('enter a job title you want to search for :' )
#by default searching on India
records_val = 50 #int(input('enter number of records (minimum 50):'))
delay_val = 1 #float(input('enter number of delay value in seconds: (prefer 0 to 3) '))
records_val = int((records_val-50)/10 + 1)
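# maps the requested record count to the number of scroll/click iterations below (e.g. 50 -> 1, 100 -> 6, 150 -> 11)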
retry_limit = 3 #int(input('enter retry limit : '))
# In[ ]:
# !pip install pyautogui
# import pyautogui
# pyautogui.moveRel(0, 10)
# removed proxy and use headers as per - https://github.com/mratanusarkar/Dataset-Indian-Companies/blob/master/scraper.py
# # Job Scrapper & Analytics
#
# In[ ]:
#for automating linkedin scrape
#(doubt here) get_ipython().system('pip install selenium')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
#downloading and storing image
from io import BytesIO
from PIL import Image
import base64
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless') # Run Chrome in headless mode (no GUI)
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Set up ChromeDriver without specifying the executable path
driver = webdriver.Chrome(options=chrome_options)
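# (for reference: this webdriver.Chrome(...) call is the line the traceback at the bottom of the post fails on)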
# In[ ]:
import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import numpy as np
import re
#import spacy
# In[ ]:
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
#header={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
# In[ ]:
#check before start
linkedin_working = requests.get('https://linkedin.com', headers=header)
ambition_working = requests.get('https://ambitionbox.com', headers=header)
if ambition_working.status_code == 200:
    print('Ambition scrape success', ambition_working.status_code)
else:
    print('Ambition scrape fail ', ambition_working.status_code)
if linkedin_working.status_code == 200:
    print('linkedin scrape success ', linkedin_working.status_code)
else:
    print('linkedin scrape fail ', linkedin_working.status_code)
# In[ ]:
#if choice==1:
# link = f'https://www.linkedin.com/jobs/search?{keywords}=&location=India&locationId=&geoId=102713980&f_TPR=&f_E=5&position=1&pageNum=0'.format(keywords)
#else:
link = "https://www.linkedin.com/jobs/search?keywords=&location=India&locationId=&geoId=102713980&f_TPR=r86400&f_PP=106442238%2C105214831%2C106164952%2C105556991%2C103671728%2C104869687&position=1&pageNum=0"
page = requests.get(link, headers=header)
count = 0
# retry until we get a definitive response, up to retry_limit attempts
while count < retry_limit and page.status_code != 200 and page.status_code != 404:
    time.sleep(delay_val)
    page = requests.get(link, headers=header)
    count += 1
print(page)
# In[ ]:
driver.get(link)
def scroll_and_click_show_more():
    # Scroll down the page until the "See more jobs" button can be clicked
    flag = True
    while flag:
        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Adjust the sleep time as needed
        try:
            # Find and click the "See more jobs" button
            show_more_button = driver.find_element(By.XPATH, "//button[@aria-label='See more jobs']")
            if show_more_button.is_displayed():
                show_more_button.click()
                print('show more clicked')
                # Wait for the new content to load
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'job-card-container')))
                time.sleep(5)  # Adjust the sleep time after clicking
                flag = False
            else:
                break
        except Exception as e:
            print("Element not found or not clickable. Reloading and retrying.", e)
            driver.get(link)

# Scroll the page and click "See more jobs" multiple times (adjust the range based on your needs)
for _ in range(records_val):  # Increase the scroll range for more entries
    time.sleep(10)
    scroll_and_click_show_more()
page_content = driver.page_source
# Parse the updated content using BeautifulSoup
soup = BeautifulSoup(page_content, 'html.parser')
# Extract data as before
list_of_roles = [span.text.strip() for span in soup.find_all('span', class_='sr-only')]
list_of_companies = [element.text.strip() for element in soup.find_all('h4', class_='base-search-card__subtitle')]
list_of_locations = [element.text.strip() for element in soup.find_all('span', class_='job-search-card__location')]
#list_of_logo_link = [element.text.strip() for element in soup.find_all('img', class_='artdeco-entity-image artdeco-entity-image--square-4 lazy-loaded')]
target_class = "base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]"
filtered_urls = [a['href'] for a in soup.find_all('a', class_=target_class)]
# print(filtered_urls)
# Close the browser window
driver.quit()
# In[ ]:
def job_salary_finder(job_role):
    # build the AmbitionBox salary URL slug from the role name
    a = "-".join(job_role.split(' '))
    print(a)
    #ambition_sal_link = f"https://www.ambitionbox.com/profile/{a}-salary"
    ambition_sal_link = "https://www.ambitionbox.com/profile/{}-salary".format(a)
    response = requests.get(ambition_sal_link, headers=header)
    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
        time.sleep(delay_val)
        response = requests.get(ambition_sal_link, headers=header)
        count += 1
    soup = BeautifulSoup(response.content, "html.parser")
    sal_role = [span.text.strip() for span in soup.find_all('p', class_='average-salary')]
    if len(sal_role) >= 1:
        match = re.search(r'₹\s*([\d.]+)\s*LPA', sal_role[0])
        if match:
            sal_role = match.group(1)
    return sal_role
# return companyavg_sal_for_role
def Avg_Sal_asper_role_and_company(job_role, comp_name):
    # build URL slugs from the company and role names
    comp_name = "-".join(comp_name.split(' '))
    job_role = "-".join(job_role.split(' '))
    #company_avg_sal_for_role = f"https://www.ambitionbox.com/salaries/{comp_name}-salaries/{job_role}?campaign=salaries_widget"
    company_avg_sal_for_role = "https://www.ambitionbox.com/salaries/{}-salaries/{}?campaign=salaries_widget".format(comp_name, job_role)
    response = requests.get(company_avg_sal_for_role, headers=header)
    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
        time.sleep(delay_val)
        response = requests.get(company_avg_sal_for_role, headers=header)
        count += 1
    soup = BeautifulSoup(response.content, "html.parser")
    company_avg_sal_for_role = [span.text.strip() for span in soup.find_all(class_='row-left__salary bold-display')]
    if len(company_avg_sal_for_role) >= 1:
        match = re.search(r'₹\s*([\d.]+)\s*LPA', company_avg_sal_for_role[0])
        if match:
            company_avg_sal_for_role = match.group(1)
    return company_avg_sal_for_role
# In[ ]:
def company_overview(compname):
    #comp_overview = f"https://www.ambitionbox.com/overview/{compname}-overview"
    comp_overview = "https://www.ambitionbox.com/overview/{}-overview".format(compname)
    response = requests.get(comp_overview, headers=header)
    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
        time.sleep(delay_val)
        response = requests.get(comp_overview, headers=header)
        count += 1
    soup = BeautifulSoup(response.content, "html.parser")
    foundedyear = [span.text.strip() for span in soup.find_all('p', class_='textItem__val aboutItem__value')]
    if len(foundedyear) >= 1:
        foundedyear = foundedyear[0]
    ownership = [span.text.strip() for span in soup.find_all('a', class_='aboutItem__link')]
    if len(ownership) >= 1:
        ownership = ownership[0]
        if '.' in ownership:
            ownership = ""
    employee_count = [span.text.strip() for span in soup.find_all(class_='textItem__val aboutItem__value')]
    if len(employee_count) >= 3:
        employee_count = employee_count[2]
    headquater = [span.text.strip() for span in soup.find_all(class_='textItem__val aboutItem__value')]
    if len(headquater) >= 5:
        headquater = headquater[4]
    offices = [span.text.strip() for span in soup.find_all(class_='aboutItem__value flex-row')]
    compweblink = [span.text.strip() for span in soup.find_all(class_='textItem__val aboutItem__value aboutItem__website')]
    # New addition: "best way to apply" percentages and their labels
    percentage_divs = soup.find_all('div', class_="_5o8sL sbold-list-header")
    name_divs = soup.find_all('div', class_="_3Vkds +BYOL body-medium")
    # Build "name: percentage" strings
    formatted_strings = []
    for i in range(len(percentage_divs)):
        percentage = percentage_divs[i].get_text(strip=True)
        name = name_divs[i].get_text(strip=True)
        formatted_strings.append(f"{name}: {percentage}")
    # Join the formatted strings into a single string
    best_apply_mode = "; ".join(formatted_strings)
    # print("best_apply_mode", best_apply_mode)
    reviews_element = soup.find('p', class_="newHInfo__rc")
    # Extract the reviews text or assign an empty string if not found
    if reviews_element:
        reviews_text = reviews_element.get_text(strip=True)
    else:
        reviews_text = ""
    rating_divs = soup.find_all('div', class_='avg_review_item clickable')
    # Build "category: rating" strings for each sub-rating
    formatted_ratings = []
    for div in rating_divs:
        rating_element = div.find('p', class_='card-rating bold-list-header')
        rating = rating_element.get_text(strip=True) if rating_element else ""
        category_element = div.find('p', class_='body-medium rating-name clickable-rating')
        category = category_element.get_text(strip=True) if category_element else ""
        formatted_ratings.append(f"{category}: {rating}")
    # Join the formatted strings into a single string
    ratings_string = "; ".join(formatted_ratings)
    print(ratings_string)
    return {
        'founded_year': foundedyear,
        'ownershipType': ownership,
        'Employee_count': employee_count,
        'HQ_location': headquater,
        'offices_loc': offices,
        'web_link': compweblink,
        'best_apply_mode': best_apply_mode,
        'total_reviews': reviews_text,
        'sub_ratings': ratings_string
    }
# In[ ]:
def company_ratings(compname):
    #comp_ratelink = f"https://www.ambitionbox.com/reviews/{compname}-reviews"
    comp_ratelink = "https://www.ambitionbox.com/reviews/{}-reviews".format(compname)
    response = requests.get(comp_ratelink, headers=header)
    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
        time.sleep(delay_val)
        response = requests.get(comp_ratelink, headers=header)
        count += 1
    soup = BeautifulSoup(response.content, "html.parser")
    comprate = [span.text.strip() for span in soup.find_all('p', class_='rating-val bold-title')]
    return comprate
# In[ ]:
def get_JD(link, retry_attempts=5):
    try_count = 0
    while try_count < retry_attempts:
        try:
            # Introduce a delay to avoid rate limiting
            time.sleep(delay_val + 1)  # You can adjust the sleep duration based on your needs
            # Fetch the job page content
            response = requests.get(link, headers=header)
            if response.status_code == 200:
                page_content = response.text
                soup = BeautifulSoup(page_content, 'html.parser')
                # Find the company logo image link
                img_tags = soup.find_all('img')[1]
                data_ghost_url = img_tags.get('data-delayed-url')
                if not data_ghost_url or len(data_ghost_url) <= 2:
                    data_ghost_url = "https://i.imgur.com/hCeFbz7.jpeg"
                # Extract the company industry type
                job_criteria_text = soup.find_all('span', class_='description__job-criteria-text--criteria')[3].text.strip()
                # Find the job description text (truncated, without clicking the show-more button)
                job_description_div = soup.find('div', {'class': 'description__text'})
                job_description_text = job_description_div.get_text(strip=True) if job_description_div else ""
                if len(job_description_text) >= 50:
                    job_description_text = job_description_text[:450]
                    job_description_text += "...."
                criteria_items = soup.find_all('li', class_='description__job-criteria-item')
                # Build "label: value" strings for each criteria item
                formatted_criteria = []
                for item in criteria_items:
                    label = item.find('h3', class_='description__job-criteria-subheader').get_text(strip=True)
                    value = item.find('span', class_='description__job-criteria-text').get_text(strip=True)
                    formatted_criteria.append(f"{label}: {value}")
                # Join the formatted criteria into a string
                criteria_string = "; ".join(formatted_criteria)
                # print(criteria_string)
                return [job_description_text, data_ghost_url, job_criteria_text, criteria_string]
            elif response.status_code == 429:  # Too Many Requests
                print("Too Many Requests. Retrying...")
                try_count += 1
                continue
            else:
                #print(f"Failed to fetch the job page. Status Code: {response.status_code}")
                print("Failed to fetch the job page. Status Code: {}".format(response.status_code))
                #print(f"Link failed: {link}")
                print("Link failed: {}".format(link))
                return ["", "https://i.imgur.com/hCeFbz7.jpeg", "", ""]
        except Exception as e:
            #print(f"Error processing link: {link}")
            print("Error processing link: {}".format(link))
            #print(f"Error details: {e}")
            print("Error details: {}".format(e))
            return ["", "", "", ""]
    return ["", "https://i.imgur.com/hCeFbz7.jpeg", "", ""]
# In[ ]:
from datetime import datetime
#get_ipython().system('pip install pymongo')
from pymongo import MongoClient
list_of_roles = list_of_roles[2:-1] #some extra headers
#For self checking can comment prints below
print(len(list_of_roles))
print(len(list_of_companies))
print(len(list_of_locations))
print(len(filtered_urls))
#creating the dataframe
min_length = min(len(list_of_roles), len(list_of_companies), len(list_of_locations), len(filtered_urls))
# Connect to MongoDB Atlas
connection_string = "mongodb+srv://************:cw**************@cluster0.g0eb8jw.mongodb.net/LI_DB?retryWrites=true&w=majority"
client = MongoClient(connection_string)
# Select the Database and Collection
db = client['LI_DB']
collection = db['LI_Collection']
# Iterate over each record and insert into MongoDB
for i in range(0, min_length):
    # Call the functions for each record individually
    avg_sal_role = job_salary_finder(list_of_roles[i])
    job_desc, image_url, industry_type, criteria_string = get_JD(filtered_urls[i])
    avg_sal_role_and_comp = Avg_Sal_asper_role_and_company(list_of_roles[i], list_of_companies[i])
    # If a salary (or salary range) was found, average the low and high ends; otherwise keep None
    avg_sal_role_and_comp = (
        (float(re.search(r'₹?(\d+(\.\d+)?)', str(avg_sal_role_and_comp)).group(1)) +
         float(re.search(r'₹?(\d+(\.\d+)?)', str(avg_sal_role_and_comp).split(' - ')[-1]).group(1))
         ) / 2 if re.search(r'₹?(\d+(\.\d+)?)', str(avg_sal_role_and_comp)) else None
    )
    company_rate = company_ratings(list_of_companies[i])
    company_info = company_overview(list_of_companies[i])
    # Create a dictionary to hold the entry data
    entry = {
        'Job_Role': list_of_roles[i],
        'Company_Name': list_of_companies[i],
        'Job_location': list_of_locations[i],
        'Job_link': filtered_urls[i],
        'Avg_Sal_asper_role': avg_sal_role,
        'JobDesc': job_desc,
        'ImageURL': image_url,
        'IndustryType': industry_type,
        'Avg_Sal_asper_role_and_comp': avg_sal_role_and_comp,
        'Company_rating': company_rate,
        'Founded_Year': company_info['founded_year'],
        'Ownership_Type': company_info['ownershipType'],
        'Employee_Count': company_info['Employee_count'],
        'HQ_Location': company_info['HQ_location'],
        'Offices_Location': company_info['offices_loc'],
        'Website_Link': company_info['web_link'],
        'best_apply_mode': company_info['best_apply_mode'],
        'total_reviews': company_info['total_reviews'],
        'sub_ratings': company_info['sub_ratings'],
        'criteria_string': criteria_string,
        # Add other fields as needed
        'Fetch_DT': datetime.now()
    }
    # Insert the entry into MongoDB
    collection.insert_one(entry)
# Close the MongoDB client
client.close()
These are the errors I get:
(myenv) 16:19 ~ $ python3 jobscrapperanalyticsMAIN.py
Welcome!
Traceback (most recent call last):
  File "/home/thanosmayberight/jobscrapperanalyticsMAIN.py", line 66, in <module>
    driver = webdriver.Chrome(options=chrome_options)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/chrome/webdriver.py", line 70, in __init__
    super(WebDriver, self).__init__(DesiredCapabilities.CHROME['browserName'], "goog",
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/chromium/webdriver.py", line 93, in __init__
    RemoteWebDriver.__init__(
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 269, in __init__
    self.start_session(capabilities, browser_profile)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 360, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 425, in execute
    self.error_handler.check_response(response)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/errorhandler.py", line 247, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.SessionNotCreatedException: Message: session not created
from timeout: Timed out receiving message from renderer: 600.000
(Session info: headless chrome=90.0.4430.212)
Stacktrace:
#0 0x55eb0fe2ee89 <unknown>
(myenv) 10:51 ~ $
Everything is installed fine:
selenium beautifulsoup4 bs4 pandas numpy pymongo Pillow nltk spacy playwright requests
I picked Python 3.10, which is the same version I use in the local env. I'm so confused. Sorry for the n00b question. I wanted to use Azure earlier but couldn't figure it out, so this place is the last resort.
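In case it helps narrow things down, here is a stripped-down snippet with nothing from my scraper in it, just the same headless Chrome setup as above (example.com is only a placeholder URL). If the problem is really the driver start-up, this should fail with the same error on its own:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# the SessionNotCreatedException above is raised by this call
driver = webdriver.Chrome(options=chrome_options)
print("driver started")
driver.get("https://example.com")  # placeholder page, not part of the real scrape
print(driver.title)
driver.quit()

If even this tiny version times out, then the scraping code itself isn't the problem and it's something about running headless Chrome in that environment.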