I'm working on a script that can scrape a list from two nearly identical websites, and as such, I'm attempting to do this using Selenium. The issue with the site I'm fetching the information from is that it loads stuff through javascript after the web page is fetched/downloaded through your browser, and selenium/pyvirtualdisplay isn't letting that javascript run to create the rest of the webpage for me to sort through.
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from sys import exit
from time import sleep
disneyland_url = "http://disneyland.disney.go.com/maps/service-details/18583410%3bentitytype%3dguest-service"
disneyworld_url = "http://disneyworld.disney.go.com/maps/service-details/18579731%3bentitytype%3dguest-service"
names_xpath = "//div[@class='textContainer']/div/text()"
coords_xpath = "//div[@class='textContainer']/parent::div/@data-id"
dlString = ""
dwString = ""
def scrape(url):
htmlString = ""
display = Display(visible=0, size=(800, 600))
display.start()
print "Displaying"
for retry in range(3):
try:
browser = webdriver.Firefox()
break
except:
sleep(3)
try:
browser.get("http://disneyland.disney.go.com/maps/service-details/18583410%3bentitytype%3dguest-service")
print browser.title
WebDriverWait(browser, 50).until(EC.presence_of_element_located((By.Class, "textContainer")))
innerHTML = browser.execute_script("return document.body.innerHTML")
htmlString = innerHTML
except:
print "Website didn't load in time"
browser.quit()
display.stop()
exit("Website Error")
finally:
browser.quit()
display.stop()
tree = html.fromstring(htmlString)
print(htmlString)
names = tree.xpath(names_xpath)
print(names)
scrape(disneyland_url)
Does anyone know what I'm doing wrong in regards to this?