3
3
"""
4
4
5
5
import time
6
+ from urllib .parse import unquote_plus , urlparse
6
7
7
8
from scrapy .http import HtmlResponse
8
- from urllib .parse import urlparse , unquote_plus
9
+ from selenium .common .exceptions import TimeoutException
10
+ from selenium .webdriver .support .ui import WebDriverWait
9
11
10
12
11
13
class CustomDownloaderMiddleware :
@@ -27,15 +29,17 @@ def process_request(self, request, spider):
27
29
28
30
self .driver .get (unquote_plus (
29
31
request .url )) # Decode the URL, otherwise Firefox is not happy. Ex: /#%21/ => /#!/%21
30
- time .sleep (spider .js_wait )
31
- body = self .driver .page_source .encode ('utf-8' )
32
- url = self .driver .current_url
33
32
34
- return HtmlResponse (
35
- url = url ,
36
- body = body ,
37
- encoding = 'utf8'
38
- )
33
+ try :
34
+ # Wait for DOM ready
35
+ WebDriverWait (self .driver , 10 ).until (
36
+ lambda d : d .execute_script ("return document.readyState" ) == "complete"
37
+ )
38
+ except TimeoutException :
39
+ time .sleep (spider .js_wait )
40
+
41
+ body = self .driver .page_source .encode ("utf-8" )
42
+ return HtmlResponse (url = self .driver .current_url , body = body , encoding = "utf8" )
39
43
40
44
def process_response (self , request , response , spider ):
41
45
# Since Scrapy uses start_urls and stop_urls before creating the request
@@ -47,7 +51,7 @@ def process_response(self, request, response, spider):
47
51
url_without_params = o .scheme + "://" + o .netloc + o .path
48
52
response = response .replace (url = url_without_params )
49
53
50
- if response .url == request .url + '#' :
54
+ if response .url == request .url + "#" :
51
55
response = response .replace (url = request .url )
52
56
53
57
return response
0 commit comments