[2024重置] Python爬虫看这篇就够了04 httpx 断点重传

  • 第一个 if
    • os.path.exists(file_path): 判断该文件是否已存在, 从而确定是否需要断点重传
    • file_size: 获取文件大小, 到时候从这个地方开始重传
    • mode: 设置写入文件的方式, 是重写还是续写

需要讲解的就是上面这三个点了

import os
import sys
import time
from datetime import datetime

import httpx
from httpx import RemoteProtocolError
from loguru import logger
from tqdm import tqdm

# Send log output to a per-day file under ./log; loguru starts a new file
# once the current one reaches 10 MB.
logger.add('./log/{}.log'.format(datetime.today().date()), rotation='10 MB')

# Route both plain-HTTP and HTTPS traffic through the same local proxy.
proxy_list = dict.fromkeys(('http://', 'https://'), "http://127.0.0.1:7778")


def download_file(url, path, filename):
    """Download ``url`` into ``path/filename``, resuming a partial file if present.

    If the target file already exists, its current size is sent as an HTTP
    ``Range`` offset and the response body is appended to it; otherwise the
    file is created from scratch. Requests go through the module-level
    ``proxy_list``. A ``RemoteProtocolError`` triggers a retry (up to 5
    attempts, 5 seconds apart), and each retry resumes from whatever has been
    written to disk so far.

    Args:
        url: Address of the resource to fetch.
        path: Destination directory; created if it does not exist.
        filename: Name of the file inside ``path``.

    Returns:
        True when the file was downloaded completely, or when the server
        answered 416 (the requested range starts at or past the end of the
        resource, which is treated as "already complete").
        False when the server answered with any other unexpected status; the
        response body is logged and the status code is printed.

    Raises:
        RuntimeError: If the number of bytes received does not match the
            size announced by the server's ``Content-Length`` header.
        SystemExit: With exit code 1 once every retry attempt has failed
            with ``RemoteProtocolError``.
    """
    file_path = os.path.join(path, filename)
    os.makedirs(path, exist_ok=True)

    retry_attempts = 5  # Number of retry attempts
    retry_delay = 5  # Delay in seconds between retries

    for attempt in range(retry_attempts):
        try:
            # Recompute the resume state on every attempt: a failed attempt
            # may have written more bytes, and a stale Range header must not
            # survive if the file has disappeared in the meantime.
            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                headers = {'Range': f'bytes={file_size}-'}
                mode = 'ab'  # Append to existing file
            else:
                file_size = 0
                headers = {}
                mode = 'wb'  # Write new file

            # Use the client as a context manager so its connection pool is
            # closed even when the download fails part-way through.
            with httpx.Client(proxies=proxy_list) as client:
                with client.stream("GET", url, headers=headers, timeout=10) as response:
                    status = response.status_code

                    if status == 416:
                        # Range not satisfiable: nothing left to fetch.
                        return True

                    if status not in (200, 206):
                        # errors='replace' keeps a non-UTF-8 error body from
                        # raising while we are only trying to log it.
                        content = response.read().decode('utf-8', errors='replace')
                        logger.error(f'{content}: {filename}')
                        print(response.status_code)
                        return False

                    if status == 200:
                        # 200 (rather than 206 Partial Content) means the
                        # server sent the whole resource, ignoring any Range
                        # header. Appending it to a partial file would corrupt
                        # the result, so start over from byte 0.
                        file_size = 0
                        mode = 'wb'

                    # response.headers.get('Content-Disposition')  # server-suggested filename
                    content_length = response.headers.get("Content-Length")
                    if content_length is None:
                        # Size unknown (e.g. chunked transfer): no total for
                        # the progress bar and no completeness check below.
                        total_size = None
                    else:
                        total_size = int(content_length) + file_size
                    block_size = 1024

                    with tqdm(total=total_size, initial=file_size, unit="B", unit_scale=True) as progress_bar:
                        with open(file_path, mode) as file:
                            # iter_raw yields the undecoded bytes, so the byte
                            # count is comparable with Content-Length.
                            for data in response.iter_raw(block_size):
                                file.write(data)
                                progress_bar.update(len(data))

                    if total_size is not None and progress_bar.n != total_size:
                        raise RuntimeError("Could not download file")
                    return True
        except RemoteProtocolError:
            if attempt < retry_attempts - 1:
                logger.warning(f"Attempt {attempt + 1} failed. Retrying in {retry_delay} seconds...({filename})")
                time.sleep(retry_delay)
            else:
                logger.warning(f"Download failed after {retry_attempts} attempts.({filename})")
                sys.exit(1)


# Example run: fetch a sample Unsplash photo into a local test directory.
download_file(
    url="https://images.unsplash.com/photo-1597776089810-2550e6d1e689?ixlib=rb-4.0.3&q=85&fm=jpg&crop=entropy&cs=srgb&dl=prescott-horn-qhixfmpqN8s-unsplash.jpg&w=1920",
    path=r"E:\Code\Workspace\Python\test",
    filename="abc.jpeg",
)

Licensed under CC BY-NC-SA 4.0
本博客已稳定运行
发表了53篇文章 · 总计28.17k字
使用 Hugo 构建
主题 StackJimmy 设计