
Commit 678d77b

Merge pull request #79 from tharropoulos/header-middleware
Add header inspection tools
2 parents 27407d7 + 2256baf commit 678d77b

File tree: 3 files changed, +67 −2 lines changed

scraper/src/header_inspector_middleware.py (new file)

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
import logging

from scrapy import signals


class HeaderInspectionMiddleware:
    """
    Middleware to inspect headers of outgoing requests and incoming responses
    """

    def __init__(self):
        self.spider = None

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        self.spider = spider

    def process_request(self, request, spider):
        """
        This method is called for each request that goes through the download middleware.
        """
        logging.debug("\nOutgoing request to: %s", request.url)
        logging.debug("\nHeaders: %s", request.headers)
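
The committed middleware only implements process_request, although its docstring also mentions incoming responses. For reference, response headers can be inspected from a Scrapy downloader middleware through the standard process_response hook; the sketch below is illustrative only and is not part of this commit.

import logging


class ResponseHeaderInspectionMiddleware:
    """Illustrative sketch (not in this commit): logs incoming response headers."""

    def process_response(self, request, response, spider):
        # Called for each response returned by the downloader.
        logging.debug("\nIncoming response from: %s", response.url)
        logging.debug("\nHeaders: %s", response.headers)
        # Return the response unchanged so the middleware chain continues.
        return response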

scraper/src/index.py

Lines changed: 3 additions & 2 deletions
@@ -14,9 +14,9 @@
 from .documentation_spider import DocumentationSpider
 from .strategies.default_strategy import DefaultStrategy
 from .custom_downloader_middleware import CustomDownloaderMiddleware
+from .header_inspector_middleware import HeaderInspectionMiddleware
 from .custom_dupefilter import CustomDupeFilter
 from .config.browser_handler import BrowserHandler
-from .strategies.algolia_settings import AlgoliaSettings

 try:
     # disable boto (S3 download)
@@ -46,6 +46,7 @@ def run_config(config):

     root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
     DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
+    HEADER_MIDDLEWARES_PATH = root_module + 'header_inspector_middleware.' + HeaderInspectionMiddleware.__name__
     DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

     headers = {
@@ -98,7 +99,7 @@ def run_config(config):
         'LOG_ENABLED': '1',
         'LOG_LEVEL': 'ERROR',
         'USER_AGENT': config.user_agent,
-        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
+        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900, HEADER_MIDDLEWARES_PATH: 901},
         # Need to be > 600 to be after the redirectMiddleware
         'DUPEFILTER_USE_ANCHORS': config.use_anchors,
         # Use our custom dupefilter in order to be scheme agnostic regarding link provided
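
Scrapy orders entries in DOWNLOADER_MIDDLEWARES by their priority value: requests pass through middlewares in ascending order, so the existing CustomDownloaderMiddleware at 900 sees each request before the new HeaderInspectionMiddleware at 901 (responses travel back in the reverse order). As a rough illustration, once root_module resolves to 'scraper.src.' the two dotted paths built above are equivalent to hard-coding this setting; the sketch is illustrative and not part of the commit.

# Illustrative sketch only: the equivalent hard-coded Scrapy setting after
# DOWNLOADER_MIDDLEWARES_PATH and HEADER_MIDDLEWARES_PATH resolve.
DOWNLOADER_MIDDLEWARES = {
    'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware': 900,
    'scraper.src.header_inspector_middleware.HeaderInspectionMiddleware': 901,
}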
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import os
import pdb
from unittest.mock import MagicMock

import pytest
from scrapy.http import Request
from scrapy.spidermiddlewares.httperror import HttpError


@pytest.fixture
def config():
    return MagicMock(
        index_name="test_index",
        start_urls=[{"url": "http://example.com"}],
        allowed_domains=["example.com"],
        stop_urls=[],
        js_render=False,
    )


@pytest.fixture
def env_vars(monkeypatch):
    monkeypatch.setenv("DOCSEARCH_BASICAUTH_USERNAME", "testuser")
    monkeypatch.setenv("DOCSEARCH_BASICAUTH_PASSWORD", "testpass")
    monkeypatch.setenv("DOCSEARCH_AUTH_DOMAIN", "http://example.com")


def test_spider_auth_attributes(config, env_vars):
    """Test that DocumentationSpider correctly sets up Basic Auth attributes"""
    from scraper.src.documentation_spider import DocumentationSpider

    spider = DocumentationSpider(config=config, typesense_helper=None, strategy=None)

    assert spider.http_user == "testuser"
    assert spider.http_pass == "testpass"
    assert spider.http_auth_domain == "http://example.com"
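
The new test file covers the spider's Basic Auth attributes rather than the header middleware itself. A direct unit test for HeaderInspectionMiddleware could look roughly like the sketch below; the test name is hypothetical, the module path is assumed from the import added in scraper/src/index.py, and pytest's caplog fixture is used to capture the debug output.

# Hypothetical test sketch, not part of this commit.
import logging

from scrapy.http import Request

from scraper.src.header_inspector_middleware import HeaderInspectionMiddleware


def test_process_request_logs_outgoing_headers(caplog):
    """process_request should log the outgoing URL and its headers."""
    middleware = HeaderInspectionMiddleware()
    request = Request("http://example.com", headers={"X-Test": "1"})

    with caplog.at_level(logging.DEBUG):
        # process_request does not use the spider argument, so None suffices here
        middleware.process_request(request, spider=None)

    assert "Outgoing request to" in caplog.text
    assert "example.com" in caplog.text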
