Skip to content

Commit 27407d7

Browse files
authored
Merge pull request #80 from tharropoulos/wait-for-render
Enhance scraper reliability with improved page load detection
2 parents b0e5837 + 6254eb7 commit 27407d7

File tree

1 file changed

+14
-10
lines changed

1 file changed

+14
-10
lines changed

scraper/src/custom_downloader_middleware.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
"""
44

55
import time
6+
from urllib.parse import unquote_plus, urlparse
67

78
from scrapy.http import HtmlResponse
8-
from urllib.parse import urlparse, unquote_plus
9+
from selenium.common.exceptions import TimeoutException
10+
from selenium.webdriver.support.ui import WebDriverWait
911

1012

1113
class CustomDownloaderMiddleware:
@@ -27,15 +29,17 @@ def process_request(self, request, spider):
2729

2830
self.driver.get(unquote_plus(
2931
request.url)) # Decode url otherwise firefox is not happy. Ex /#%21/ => /#!/%21
30-
time.sleep(spider.js_wait)
31-
body = self.driver.page_source.encode('utf-8')
32-
url = self.driver.current_url
3332

34-
return HtmlResponse(
35-
url=url,
36-
body=body,
37-
encoding='utf8'
38-
)
33+
try:
34+
# Wait for DOM ready
35+
WebDriverWait(self.driver, 10).until(
36+
lambda d: d.execute_script("return document.readyState") == "complete"
37+
)
38+
except TimeoutException:
39+
time.sleep(spider.js_wait)
40+
41+
body = self.driver.page_source.encode("utf-8")
42+
return HtmlResponse(url=self.driver.current_url, body=body, encoding="utf8")
3943

4044
def process_response(self, request, response, spider):
4145
# Since Scrapy uses start_urls and stop_urls before creating the request
@@ -47,7 +51,7 @@ def process_response(self, request, response, spider):
4751
url_without_params = o.scheme + "://" + o.netloc + o.path
4852
response = response.replace(url=url_without_params)
4953

50-
if response.url == request.url + '#':
54+
if response.url == request.url + "#":
5155
response = response.replace(url=request.url)
5256

5357
return response

0 commit comments

Comments
 (0)