Forums

Issues with web scraping

It works fine locally, so I don't understand why it fails here. Any advice would be a great help. TIA

print("Welcome!")

#!/usr/bin/env python
# coding: utf-8

# In[ ]:




#choice = int(input('custom job title? 0/1'))


#if choice==1:
#  keywords = input('enter a job title you want to search for :' )

#by default searching on India


records_val = 50 #int(input('enter number of records (minimum 50):'))
delay_val = 1 #float(input('enter number of delay value in seconds: (prefer 0 to 3) '))
records_val = int((records_val-50)/10 + 1)
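# presumably: records_val now holds the number of scroll / "See more jobs" iterations performed later (one per 10 requested records above the base 50)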
retry_limit = 3 #int(input('enter retry limit : '))


# In[ ]:


# !pip install pyautogui
# import pyautogui
# pyautogui.moveRel(0, 10)


# removed proxy and use headers as per - https://github.com/mratanusarkar/Dataset-Indian-Companies/blob/master/scraper.py

# # Job Scraper & Analytics
#

# In[ ]:


#for automating linkedin scrape
#(doubt here) get_ipython().system('pip install selenium')

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

#downloading and storing image
from io import BytesIO
from PIL import Image
import base64

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run Chrome in headless mode (no GUI)
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Set up ChromeDriver without specifying the executable path
driver = webdriver.Chrome(options=chrome_options)


# In[ ]:


import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import numpy as np
import re
#import spacy


# In[ ]:


header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}

#header={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}


# In[ ]:


#check before start
linkedin_working = requests.get('https://linkedin.com', headers = header)
ambition_working = requests.get('https://ambitionbox.com',headers= header )

if ambition_working.status_code==200:
  print('Ambition scrape success',ambition_working.status_code)
else:
  print('Ambition scrape fail ' ,ambition_working.status_code)

if linkedin_working.status_code==200:
  print('linkedin scrape success ',linkedin_working.status_code)
else:
  print('linkedin scrape fail ',linkedin_working.status_code)


# In[ ]:




#if choice==1:
#  link = 'https://www.linkedin.com/jobs/search?keywords={}&location=India&locationId=&geoId=102713980&f_TPR=&f_E=5&position=1&pageNum=0'.format(keywords)
#else:
link = "https://www.linkedin.com/jobs/search?keywords=&location=India&locationId=&geoId=102713980&f_TPR=r86400&f_PP=106442238%2C105214831%2C106164952%2C105556991%2C103671728%2C104869687&position=1&pageNum=0"



page = requests.get(link, headers=header)

count = 0
while count < retry_limit and page.status_code != 200 and page.status_code != 404:
  time.sleep(delay_val)
  page = requests.get(link, headers=header)
  count += 1


print(page)


# In[ ]:


driver.get(link)

def scroll_and_click_show_more():
    # Scroll down the page
    flag = True
    while flag:
      time.sleep(5)
      driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
      time.sleep(5)  # Adjust the sleep time as needed

      try:
          # Find and click the "See more jobs" button
          show_more_button = driver.find_element(By.XPATH, "//button[@aria-label='See more jobs']")
          if show_more_button.is_displayed():
              # A single click is enough; WebElement has no dbclick() method,
              # so the earlier dbclick()/repeated click() calls would raise AttributeError.
              show_more_button.click()
              print('show more clicked')
              # Wait for content to load
              WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'job-card-container')))
              time.sleep(5)  # Adjust the sleep time after clicking
              flag = False
          else:
            break
      except Exception as e:
          print("TimeoutException: Element not found. Continuing to the next iteration.",e)
          driver.get(link)

# Scroll the page and click "See more jobs" multiple times (adjust the range based on your needs)

for _ in range(records_val):  # Increase the scroll range for more entries
    time.sleep(10)
    scroll_and_click_show_more()


page_content = driver.page_source

# Parse the updated content using BeautifulSoup
soup = BeautifulSoup(page_content, 'html.parser')

# Extract data as before
list_of_roles = [span.text.strip() for span in soup.find_all('span', class_='sr-only')]
list_of_companies = [element.text.strip() for element in soup.find_all('h4', class_='base-search-card__subtitle')]
list_of_locations = [element.text.strip() for element in soup.find_all('span', class_='job-search-card__location')]

#list_of_logo_link = [element.text.strip() for element in soup.find_all('img', class_='artdeco-entity-image artdeco-entity-image--square-4 lazy-loaded')]



target_class = "base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]"
filtered_urls = [a['href'] for a in soup.find_all('a', class_=target_class)]
# print(filtered_urls)

# Close the browser window
driver.quit()



# In[ ]:


def job_salary_finder(job_role):
    list1 = job_role.split(' ')
    a = "-".join(list1)

    print(a)
    #ambition_sal_link = f"https://www.ambitionbox.com/profile/{a}-salary".format(a)
    ambition_sal_link = "https://www.ambitionbox.com/profile/{}-salary".format(a)




    response = requests.get(ambition_sal_link, headers=header)
    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
      time.sleep(delay_val)
      response = requests.get(ambition_sal_link, headers=header)
      count += 1

    soup = BeautifulSoup(response.content, "html.parser")

    sal_role = [span.text.strip() for span in soup.find_all('p', class_='average-salary')]


    if len(sal_role)>=1:
      match = re.search(r'₹\s*([\d.]+)\s*LPA', sal_role[0])

      if match:
        sal_role = match.group(1)


    return sal_role



#     return companyavg_sal_for_role
def Avg_Sal_asper_role_and_company(job_role, comp_name):
    list1 = comp_name.split(' ')
    comp_name = "-".join(list1)

    list1 = job_role.split(' ')
    job_role = "-".join(list1)

    #company_avg_sal_for_role = f"https://www.ambitionbox.com/salaries/{comp_name}-salaries/{job_role}?campaign=salaries_widget"
    company_avg_sal_for_role = "https://www.ambitionbox.com/salaries/{}-salaries/{}?campaign=salaries_widget".format(comp_name, job_role)


    response = requests.get(company_avg_sal_for_role, headers=header)
    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
        time.sleep(delay_val)
        response = requests.get(company_avg_sal_for_role, headers=header)
        count += 1

    soup = BeautifulSoup(response.content, "html.parser")
    company_avg_sal_for_role = [span.text.strip() for span in soup.find_all(class_='row-left__salary bold-display')]

    if len(company_avg_sal_for_role) >= 1:
        match = re.search(r'₹\s*([\d.]+)\s*LPA', company_avg_sal_for_role[0])

        if match:
            company_avg_sal_for_role = match.group(1)

    return company_avg_sal_for_role


# In[ ]:


def company_overview(compname):
    #comp_overview = f"https://www.ambitionbox.com/overview/{compname}-overview".format(compname)
    comp_overview = "https://www.ambitionbox.com/overview/{}-overview".format(compname)

    response = requests.get(comp_overview, headers=header)

    count = 0
    while count < retry_limit and response.status_code != 200 and response.status_code != 404:
      time.sleep(delay_val)
      response = requests.get(comp_overview, headers=header)
      count += 1

    soup = BeautifulSoup(response.content, "html.parser")

    foundedyear = [span.text.strip() for span in soup.find_all('p',class_='textItem__val aboutItem__value')]
    if len(foundedyear)>=1:
        foundedyear = foundedyear[0]
    ownership = [span.text.strip() for span in soup.find_all('a',class_='aboutItem__link')]
    if len(ownership)>=1:
        ownership = ownership[0]
        if '.' in ownership:
          ownership = ""
    employee_count = [span.text.strip() for span in soup.find_all(class_='textItem__val aboutItem__value')]
    if len(employee_count)>=3:
        employee_count = employee_count[2]
    headquater = [span.text.strip() for span in soup.find_all(class_='textItem__val aboutItem__value')]
    if len(headquater)>=5:
        headquater = headquater[4]

    offices = [span.text.strip() for span in soup.find_all(class_='aboutItem__value flex-row')]
    compweblink = [span.text.strip() for span in soup.find_all(class_='textItem__val aboutItem__value aboutItem__website')]

    #New Addition
    percentage_divs = soup.find_all('div', class_="_5o8sL sbold-list-header")

    # Find all divs containing the names
    name_divs = soup.find_all('div', class_="_3Vkds +BYOL body-medium")

    # Initialize an empty list to store formatted strings
    formatted_strings = []

    # Pair each percentage with its name; zip() stops at the shorter list, avoiding an IndexError
    for percentage_div, name_div in zip(percentage_divs, name_divs):
        # Extract the percentage and the name
        percentage = percentage_div.get_text(strip=True)
        name = name_div.get_text(strip=True)
        # Append the formatted string to the list
        formatted_strings.append(f"{name}: {percentage}")

    # Join the formatted strings into a single string
    best_apply_mode = "; ".join(formatted_strings)

    # print("best_apply_mode",best_apply_mode)


    reviews_element = soup.find('p', class_="newHInfo__rc")

    # Extract the reviews text or assign an empty string if not found
    if reviews_element:
        reviews_text = reviews_element.get_text(strip=True)
    else:
        reviews_text = ""

    rating_divs = soup.find_all('div', class_='avg_review_item clickable')

    # Initialize an empty list to store formatted strings
    formatted_ratings = []

    # Iterate through each rating div
    for div in rating_divs:
        # Extract the rating value if present
        rating_element = div.find('p', class_='card-rating bold-list-header')
        rating = rating_element.get_text(strip=True) if rating_element else ""
        # Extract the rating category if present
        category_element = div.find('p', class_='body-medium rating-name clickable-rating')
        category = category_element.get_text(strip=True) if category_element else ""
        # Append the formatted string to the list
        formatted_ratings.append(f"{category}: {rating}")

    # Join the formatted strings into a single string
    ratings_string = "; ".join(formatted_ratings)

    print(ratings_string)


    return {
        'founded_year': foundedyear,
        'ownershipType': ownership,
        'Employee_count': employee_count,
        'HQ_location': headquater,
        'offices_loc': offices,
        'web_link': compweblink,
        'best_apply_mode' : best_apply_mode,
        'total_reviews' : reviews_text,
        'sub_ratings' : ratings_string
    }




# In[ ]:


def company_ratings(compname):
  #comp_ratelink = f"https://www.ambitionbox.com/reviews/{compname}-reviews".format(compname)
  comp_ratelink = "https://www.ambitionbox.com/reviews/{}-reviews".format(compname)

  response = requests.get(comp_ratelink, headers=header)
  count = 0
  while count < retry_limit and response.status_code != 200 and response.status_code != 404:
    time.sleep(delay_val)
    response = requests.get(comp_ratelink, headers=header)
    count += 1

  soup = BeautifulSoup(response.content, "html.parser")

  comprate = [span.text.strip() for span in soup.find_all('p',class_='rating-val bold-title')]

  return comprate


# In[ ]:


def get_JD(link, retry_attempts=5):
    try_count = 0
    while try_count < retry_attempts:
        try:
            # Introduce a delay to avoid rate limiting
            time.sleep(delay_val + 1)  # You can adjust the sleep duration based on your needs

            # Fetch the job page content
            response = requests.get(link, headers=header)

            if response.status_code == 200:
                page_content = response.text
                soup = BeautifulSoup(page_content, 'html.parser')

                # Find image link (the second <img> on the page holds the company logo)
                img_tags = soup.find_all('img')
                data_ghost_url = img_tags[1].get('data-delayed-url') if len(img_tags) > 1 else None

                # Fall back to a placeholder image if no usable logo URL was found
                if not data_ghost_url or len(data_ghost_url) <= 2:
                    data_ghost_url = "https://i.imgur.com/hCeFbz7.jpeg"

                # Extracting company industry type (4th job-criteria entry, when present)
                criteria_spans = soup.find_all('span', class_='description__job-criteria-text--criteria')
                job_criteria_text = criteria_spans[3].text.strip() if len(criteria_spans) > 3 else ""

                # Find the job description text only doing for 200 words without clicking show more button
                job_description_div = soup.find('div', {'class': 'description__text'})

                # Extract the job description text
                job_description_text = job_description_div.get_text(strip=True) if job_description_div else ""

                if len(job_description_text) >= 50:
                    job_description_text = job_description_text[:450]
                    job_description_text += "...."

                criteria_items = soup.find_all('li', class_='description__job-criteria-item')

                # Initialize an empty list to store formatted criteria
                formatted_criteria = []

                # Iterate through each criteria item
                for item in criteria_items:
                    # Extract the criteria label
                    label = item.find('h3', class_='description__job-criteria-subheader').get_text(strip=True)
                    # Extract the criteria value
                    value = item.find('span', class_='description__job-criteria-text').get_text(strip=True)
                    # Append the formatted criteria to the list
                    formatted_criteria.append(f"{label}: {value}")

                # Join the formatted criteria into a string
                criteria_string = "; ".join(formatted_criteria)

                # print(criteria_string)


                # Also return the criteria string so it can be stored with the MongoDB record below
                return [job_description_text, data_ghost_url, job_criteria_text, criteria_string]
            elif response.status_code == 429:  # Too Many Requests
                print("Too Many Requests. Retrying...")
                try_count += 1
                continue
            else:
                #print(f"Failed to fetch the job page. Status Code: {response.status_code}")
                print("Failed to fetch the job page. Status Code: {}".format(response.status_code))
                #print(f"Link failed: {link}")
                print("Link failed: {}".format(link))
                return ["", "https://i.imgur.com/hCeFbz7.jpeg", ""]
        except Exception as e:
            #print(f"Error processing link: {link}")
            print("Error processing link: {}".format(link))
            #print(f"Error details: {e}")
            print("Error details: {}".format(e))
            return ["", "", ""]
    return ["", "https://i.imgur.com/hCeFbz7.jpeg", ""]


# In[ ]:


from datetime import datetime

#get_ipython().system('pip install pymongo')
from pymongo import MongoClient

list_of_roles = list_of_roles[2:-1] #some extra headers

#For self checking can comment prints below
print(len(list_of_roles))
print(len(list_of_companies))
print(len(list_of_locations))
print(len(filtered_urls))

#creating the dataframe

min_length = min(len(list_of_roles), len(list_of_companies), len(list_of_locations), len(filtered_urls))


# Connect to MongoDB Atlas
connection_string = "mongodb+srv://************:cw**************@cluster0.g0eb8jw.mongodb.net/LI_DB?retryWrites=true&w=majority"
client = MongoClient(connection_string)

# Select the Database and Collection
db = client['LI_DB']
collection = db['LI_Collection']

# Iterate over each record and insert into MongoDB
for i in range(0,min_length):
    # Call the functions for each record individually
    avg_sal_role = job_salary_finder(list_of_roles[i])
    job_desc, image_url, industry_type, criteria_string = get_JD(filtered_urls[i])
    avg_sal_role_and_comp = Avg_Sal_asper_role_and_company(list_of_roles[i], list_of_companies[i])
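    # If the salary string came back as a range like "X - Y", this averages the two endpoints;
    # for a single value both regex matches are the same number, and it falls back to None if no number is found.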
    avg_sal_role_and_comp = (
        (float(re.search(r'₹?(\d+(\.\d+)?)', str(avg_sal_role_and_comp)).group(1)) +
        float(re.search(r'₹?(\d+(\.\d+)?)', str(avg_sal_role_and_comp).split(' - ')[-1]).group(1))
        ) / 2 if re.search(r'₹?(\d+(\.\d+)?)', str(avg_sal_role_and_comp)) else None
    )
    company_rate = company_ratings(list_of_companies[i])
    company_info = company_overview(list_of_companies[i])

    # Create a dictionary to hold the entry data
    entry = {
        'Job_Role': list_of_roles[i],
        'Company_Name': list_of_companies[i],
        'Job_location': list_of_locations[i],
        'Job_link': filtered_urls[i],
        'Avg_Sal_asper_role': avg_sal_role,
        'JobDesc': job_desc,
        'ImageURL': image_url,
        'IndustryType': industry_type,
        'Avg_Sal_asper_role_and_comp': avg_sal_role_and_comp,
        'Company_rating': company_rate,
        'Founded_Year': company_info['founded_year'],
        'Ownership_Type': company_info['ownershipType'],
        'Employee_Count': company_info['Employee_count'],
        'HQ_Location': company_info['HQ_location'],
        'Offices_Location': company_info['offices_loc'],
        'Website_Link': company_info['web_link'],
        'best_apply_mode' : company_info['best_apply_mode'],
        'total_reviews': company_info['total_reviews'],
        'sub_ratings' : company_info['sub_ratings'],
        'criteria_string': criteria_string,

        # Add other fields as needed
        'Fetch_DT': datetime.now()
    }

    # Insert the entry into MongoDB
    collection.insert_one(entry)

# Close the MongoDB client
client.close()

These are the errors I get:

(myenv) 16:19 ~ $ python3 jobscrapperanalyticsMAIN.py
Welcome!
Traceback (most recent call last):
  File "/home/thanosmayberight/jobscrapperanalyticsMAIN.py", line 66, in <module>
    driver = webdriver.Chrome(options=chrome_options)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/chrome/webdriver.py", line 70, in __init__
    super(WebDriver, self).__init__(DesiredCapabilities.CHROME['browserName'], "goog",
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/chromium/webdriver.py", line 93, in __init__
    RemoteWebDriver.__init__(
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 269, in __init__
    self.start_session(capabilities, browser_profile)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 360, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 425, in execute
    self.error_handler.check_response(response)
  File "/home/thanosmayberight/myenv/lib/python3.10/site-packages/selenium/webdriver/remote/errorhandler.py", line 247, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.SessionNotCreatedException: Message: session not created
from timeout: Timed out receiving message from renderer: 600.000
(Session info: headless chrome=90.0.4430.212)
Stacktrace:
#0 0x55eb0fe2ee89 <unknown>

(myenv) 10:51 ~ $

Everything is installed correctly:

selenium beautifulsoup4 bs4 pandas numpy pymongo Pillow nltk spacy playwright requests

I picked Python 3.10, which is also what I use in my local environment. I'm confused. Sorry for the n00b question. I originally wanted to use Azure but couldn't figure it out, so this is my last resort.

You need to approach this gradually. There is a lot going on in your code; if you reduce it to a minimal reproducible failing example, we would be better able to help you.

Is it some dependency issue? I can't make sense of the errors. Any hint would be a great help.

It's very hard to say with so much code; as @fjl suggested, try removing code bit by bit to see if you can find the minimal code that causes the problem.
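For instance, a minimal sketch (saved as a hypothetical test_driver.py) that keeps only the driver start-up from the top of your script should reproduce the SessionNotCreatedException on its own; if it does, the scraping and MongoDB code is irrelevant to the failure:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Same Chrome options as in the full script
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# This is the call that raises SessionNotCreatedException in the traceback above
driver = webdriver.Chrome(options=chrome_options)
print("driver started OK")
driver.quit()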

One thing I should point out, though, is that you currently have a free account on PythonAnywhere. Free accounts have restricted Internet access and can only access the official public APIs on our allowlist. So web scraping will not work in general from a free account.
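As a quick check (a sketch, assuming only the two sites your script probes at start-up), running something like this in a console on a free account will typically show a proxy error or a non-200 status for sites that are not on the allowlist:

import requests

header = {"User-Agent": "Mozilla/5.0"}  # any browser-like UA; the full header from the script works too

for url in ("https://linkedin.com", "https://ambitionbox.com"):
    try:
        resp = requests.get(url, headers=header, timeout=30)
        print(url, resp.status_code)
    except requests.exceptions.RequestException as e:
        # On a free account, requests to non-allowlisted sites are blocked by the proxy
        print(url, "failed:", e)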

Actually, scraping works better on local machines and is hard to do from PythonAnywhere. Try using a different host!

-PythonAnywhere is still super awesome