
the crawler is collecting data slowly and memory usage is increasing #1190

Answered by janbuchar
ycq0125 asked this question in Q&A

Hi @ycq0125! I noticed that you're creating a new BeautifulSoupCrawler instance for each crawled host. Is there any chance you could reuse the same instance? Also, if you want to ingest requests from Redis, you could implement that as a RequestLoader or a RequestManager (the former if you just want to read requests from Redis, the latter if you also want Redis to handle retries and newly discovered links).

Replies: 1 comment

Answer selected by ycq0125