#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/8/31 23:34
# @Author : old tom
# @File : http_downloader.py
# @Project : Futool
# @Desc : file downloader
from futool.http.http_request import head
# a thread pool is used so the worker closure in __join can be passed to map()
from multiprocessing.pool import ThreadPool
import urllib.request as req


class HttpDownloader(object):
    """
    HTTP downloader that splits a file into byte ranges and fetches them concurrently.
    """

    def __init__(self, pool=None):
        # default to a 16-worker thread pool unless the caller supplies one
        self.pool = pool if pool else ThreadPool(16)
    def download(self, url, dst, chunk_size=1000):
        """
        Download a file, using multiple threads automatically when the server
        supports Range requests.
        :param url: download URL
        :param dst: destination path
        :param chunk_size: chunk size in bytes
        :return:
        """
        is_support, content_length = HttpDownloader.is_support_range(url)
        if is_support:
            # byte ranges assigned to each worker thread
            offset = self.fork(int(content_length), chunk_size)
            self.__join(offset, url, dst)
        else:
            print('Content-Length unavailable, falling back to a single-threaded download')
            # single plain GET when range requests are not supported
            response = req.urlopen(url)
            with open(dst, 'wb') as f:
                f.write(response.read())
    @staticmethod
    def is_support_range(url):
        """
        Check whether the server supports Range requests.
        :return: (supported, content length)
        """
        wrapper = head(url)
        header = wrapper.header()
        h_keys = header.keys()
        if 'Accept-Ranges' in h_keys and 'Content-Length' in h_keys and header['Accept-Ranges'] != 'none':
            return True, header['Content-Length']
        else:
            return False, 0
    @staticmethod
    def fork(content_length: int, chunk_size):
        """
        Split the download into byte ranges, one per worker.
        :param chunk_size: chunk size in bytes
        :param content_length: total size reported by the server
        :return: list of inclusive (start, end) byte offsets
        """
        offset = []
        # HTTP ranges are inclusive, so the last valid byte is content_length - 1
        for start in range(0, content_length, chunk_size):
            end = min(start + chunk_size - 1, content_length - 1)
            offset.append((start, end))
        return offset
    def __join(self, offset, url, dst):
        """
        Download all byte ranges concurrently and merge them into dst.
        :param offset: list of inclusive (start, end) byte ranges
        :param url: download URL
        :param dst: destination path
        :return:
        """

        def download_by_thread(part):
            _request = req.Request(url=url, headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70",
                # the Range header uses '=' between the unit and the byte range
                'Range': f'bytes={part[0]}-{part[1]}'
            }, method='GET')
            response = req.urlopen(_request)
            return response.read()

        # map() preserves input order, so the parts can be written out sequentially
        parts = self.pool.map(download_by_thread, offset)
        self.pool.close()
        self.pool.join()
        with open(dst, 'wb') as f:
            for data in parts:
                f.write(data)
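

# Minimal usage sketch (an assumption, not part of the original module): the URL and
# destination path below are placeholders, and futool.http.http_request.head must be
# importable for is_support_range() to work.
if __name__ == '__main__':
    downloader = HttpDownloader()
    # use 1 MiB chunks instead of the 1000-byte default to limit the number of range requests
    downloader.download('https://example.com/archive.zip', '/tmp/archive.zip', chunk_size=1024 * 1024)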