3 files changed: 67 additions, 2 deletions.

First file: scraper/src/header_inspector_middleware.py (new file)
+ import logging
+
+ from scrapy import signals
+
+
+ class HeaderInspectionMiddleware:
+     """
+     Middleware to inspect headers of outgoing requests and incoming responses
+     """
+
+     def __init__(self):
+         self.spider = None
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         middleware = cls()
+         crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+         return middleware
+
+     def spider_opened(self, spider):
+         self.spider = spider
+
+     def process_request(self, request, spider):
+         """
+         This method is called for each request that goes through the download middleware.
+         """
+         logging.debug("\nOutgoing request to: %s", request.url)
+         logging.debug("\nHeaders: %s", request.headers)
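A quick way to see what this middleware logs, without running a full crawl, is to instantiate it and call process_request() directly. The snippet below is only an illustrative sketch: the import path is assumed from the module layout implied by the changes in the next file, and the explicit logging setup matters because run_config() sets 'LOG_LEVEL': 'ERROR', which would otherwise filter out these debug messages.

import logging

from scrapy.http import Request

# assumed import path, based on root_module == 'scraper.src.' in the next file
from scraper.src.header_inspector_middleware import HeaderInspectionMiddleware

# allow DEBUG records so the middleware's logging.debug() calls are visible
logging.basicConfig(level=logging.DEBUG)

middleware = HeaderInspectionMiddleware()
request = Request("http://example.com", headers={"User-Agent": "docsearch-scraper"})
middleware.process_request(request, spider=None)  # logs the outgoing URL and its headers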
Second changed file (the scraper module that defines run_config()):

from .documentation_spider import DocumentationSpider
from .strategies.default_strategy import DefaultStrategy
from .custom_downloader_middleware import CustomDownloaderMiddleware
+ from .header_inspector_middleware import HeaderInspectionMiddleware
from .custom_dupefilter import CustomDupeFilter
from .config.browser_handler import BrowserHandler
- from .strategies.algolia_settings import AlgoliaSettings

try:
    # disable boto (S3 download)
@@ -46,6 +46,7 @@ def run_config(config):

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
+     HEADER_MIDDLEWARES_PATH = root_module + 'header_inspector_middleware.' + HeaderInspectionMiddleware.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    headers = {
@@ -98,7 +99,7 @@ def run_config(config):
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
-         'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
+         'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900, HEADER_MIDDLEWARES_PATH: 901},
        # Need to be > 600 to be after the redirectMiddleware
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
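With root_module evaluating to 'scraper.src.' (the usual case, when the scraper is not run as __main__), the two path constants expand to dotted class paths, so the resulting Scrapy setting looks roughly like the sketch below. This is an illustrative expansion, not code from the diff:

# approximate value of the setting built inside run_config(),
# assuming root_module == 'scraper.src.'
DOWNLOADER_MIDDLEWARES = {
    'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware': 900,
    # a higher priority number sits closer to the downloader, so for each request
    # this middleware's process_request() runs after the custom one above
    'scraper.src.header_inspector_middleware.HeaderInspectionMiddleware': 901,
}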
Third file: a new test for DocumentationSpider's Basic Auth attributes.

+ import os
+ import pdb
+ from unittest.mock import MagicMock
+
+ import pytest
+ from scrapy.http import Request
+ from scrapy.spidermiddlewares.httperror import HttpError
+
+
+ @pytest.fixture
+ def config():
+     return MagicMock(
+         index_name="test_index",
+         start_urls=[{"url": "http://example.com"}],
+         allowed_domains=["example.com"],
+         stop_urls=[],
+         js_render=False,
+     )
+
+
+ @pytest.fixture
+ def env_vars(monkeypatch):
+     monkeypatch.setenv("DOCSEARCH_BASICAUTH_USERNAME", "testuser")
+     monkeypatch.setenv("DOCSEARCH_BASICAUTH_PASSWORD", "testpass")
+     monkeypatch.setenv("DOCSEARCH_AUTH_DOMAIN", "http://example.com")
+
+
+ def test_spider_auth_attributes(config, env_vars):
+     """Test that DocumentationSpider correctly sets up Basic Auth attributes"""
+     from scraper.src.documentation_spider import DocumentationSpider
+
+     spider = DocumentationSpider(config=config, typesense_helper=None, strategy=None)
+
+     assert spider.http_user == "testuser"
+     assert spider.http_pass == "testpass"
+     assert spider.http_auth_domain == "http://example.com"