diff --git a/.gitignore b/.gitignore index bd09c359d3..e2e50679dd 100644 --- a/.gitignore +++ b/.gitignore @@ -126,4 +126,7 @@ venv.bak/ dmypy.json # node -node_modules/ \ No newline at end of file +node_modules/ + +# pdm +.pdm-python diff --git a/Dockerfile b/Dockerfile index 32a80abbc1..9e5c751db6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,16 @@ -FROM debian:trixie-slim AS builder +ARG PYTHON_BASE=3.12-slim-bookworm + +FROM python:$PYTHON_BASE AS builder + +# Repository information is needed for dynamic versioning +#COPY .git/ /app/.git/ # FIXME Problems copying ~/.git for scm data + +# Source +COPY src/ /app/src +COPY pyproject.toml pdm.lock /app/ + +# Readme is needed to satisfy PyProject +COPY README.md /app/ # Build dummy packages to skip installing them and their dependencies RUN apt-get update \ @@ -14,14 +26,28 @@ RUN apt-get update \ && apt-get purge -y --auto-remove equivs \ && rm -rf /var/lib/apt/lists/* -FROM debian:trixie-slim +WORKDIR /app + +# Extract version from git +ARG REL_REF=3.3.21 +#RUN REL_REF=$(git describe --tags --abbrev=0) # FIXME Problems copying ~/.git for scm data +RUN echo "Building FlareSolverr version $REL_REF" + +# Build package +RUN pip install --upgrade pdm +ENV PDM_UPDATE_CHECK=false +ENV PDM_BUILD_SCM_VERSION=$REL_REF +RUN pdm install --check --prod --no-editable --verbose + +FROM python:$PYTHON_BASE # Copy dummy packages COPY --from=builder /libgl1-mesa-dri.deb /adwaita-icon-theme.deb / +COPY --from=builder /app/.venv /app/.venv +COPY --from=builder /app/src /app/src # Install dependencies and create flaresolverr user WORKDIR /app -COPY requirements.txt . 
RUN apt-get update \ # Install dummy packages && dpkg -i /libgl1-mesa-dri.deb \ @@ -29,7 +55,7 @@ RUN apt-get update \ && apt-get install -f \ # Install dependencies && apt-get install -y --no-install-recommends chromium xvfb dumb-init \ - procps curl vim xauth python3 python3-pip \ + procps curl vim xauth python3 \ # Remove temporary files and hardware decoding libraries && rm -rf /var/lib/apt/lists/* \ && rm -f /usr/lib/x86_64-linux-gnu/libmfxhw* \ @@ -37,9 +63,6 @@ RUN apt-get update \ # Create flaresolverr user && useradd --home-dir /app --shell /bin/sh flaresolverr \ && chown -R flaresolverr:flaresolverr . \ - # Set up Python and install dependencies - && ln -s /usr/bin/python3 /usr/local/bin/python \ - && pip install --break-system-packages -r requirements.txt \ # Remove temporary files && rm -rf /root/.cache /tmp/* @@ -47,16 +70,14 @@ USER flaresolverr RUN mkdir -p "/app/.config/chromium/Crash Reports/pending" -COPY src . -COPY package.json ../ - EXPOSE 8191 EXPOSE 8192 # dumb-init avoids zombie chromium processes ENTRYPOINT ["/usr/bin/dumb-init", "--"] -CMD ["/usr/local/bin/python", "-u", "/app/flaresolverr.py"] +ENV PATH="/app/.venv/bin:$PATH" +CMD ["python", "-um", "flaresolverr"] # Local build # docker build -t ngosang/flaresolverr:3.3.21 . diff --git a/LICENSE b/LICENSE index 8f33130f55..e93d231633 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,65 @@ -MIT License - +Files: + * Copyright (c) 2023 Diego Heras (ngosang / ngosang@hotmail.es) +License: MIT + + +Files: + src/flaresolverr/DataRecorder/* +Copyright (c) 2021, g1879 +License: MIT + + +Files: + src/flaresolverr/DrissionPage/* + src/flaresolverr/DownloadKit/* +Copyright (c) 2020, g1879 +All rights reserved. 
+License: BSD-3-Clause + + +License: MIT + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + +License: BSD-3-Clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
-Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pdm.lock b/pdm.lock new file mode 100644 index 0000000000..d9f1759d7c --- /dev/null +++ b/pdm.lock @@ -0,0 +1,452 @@ +# This file is @generated by PDM. 
+# It is not intended for manual editing. + +[metadata] +groups = ["default", "test"] +strategy = ["inherit_metadata"] +lock_version = "4.5.0" +content_hash = "sha256:338a842603ec7aba17d3108821dbf0b208f8903e80d90b8ef0178806696dee02" + +[[metadata.targets]] +requires_python = ">=3.11" + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +requires_python = ">=3.6.0" +summary = "Screen-scraping library" +groups = ["test"] +dependencies = [ + "soupsieve>1.2", +] +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[[package]] +name = "bottle" +version = "0.12.25" +summary = "Fast and simple WSGI-framework for small web-applications." +groups = ["default"] +files = [ + {file = "bottle-0.12.25-py3-none-any.whl", hash = "sha256:d6f15f9d422670b7c073d63bd8d287b135388da187a0f3e3c19293626ce034ea"}, + {file = "bottle-0.12.25.tar.gz", hash = "sha256:e1a9c94970ae6d710b3fb4526294dfeb86f2cb4a81eff3a4b98dc40fb0e5e021"}, +] + +[[package]] +name = "certifi" +version = "2024.7.4" +requires_python = ">=3.6" +summary = "Python package for providing Mozilla's CA Bundle." +groups = ["default"] +files = [ + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.2" +requires_python = ">=3.7.0" +summary = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+groups = ["default"] +files = [ + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = 
"charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, +] + +[[package]] +name = "click" +version = "8.1.7" +requires_python = ">=3.7" +summary = "Composable command line interface toolkit" +groups = ["default"] +dependencies = [ + "colorama; platform_system == \"Windows\"", + "importlib-metadata; python_version < \"3.8\"", +] +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" 
+requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +summary = "Cross-platform colored terminal text." +groups = ["default"] +marker = "platform_system == \"Windows\"" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "cssselect" +version = "1.2.0" +requires_python = ">=3.7" +summary = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +groups = ["default"] +files = [ + {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"}, + {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, +] + +[[package]] +name = "datarecorder" +version = "3.5.3" +requires_python = ">=3.6" +summary = "用于记录数据的模块。" +groups = ["default"] +dependencies = [ + "openpyxl", +] +files = [ + {file = "DataRecorder-3.5.3-py3-none-any.whl", hash = "sha256:79072f9e7936d518c1c5e9fd076dd7228e931aa02d11b19779f816e54ff7823e"}, + {file = "DataRecorder-3.5.3.tar.gz", hash = "sha256:011f566e318507c7d804b69c354c08d0159bf112a08521d42bcd0144c1d3c3c3"}, +] + +[[package]] +name = "downloadkit" +version = "2.0.2" +requires_python = ">=3.6" +summary = "一个简洁易用的多线程文件下载工具。" +groups = ["default"] +dependencies = [ + "DataRecorder>=3.4.11", + "requests", +] +files = [ + {file = "DownloadKit-2.0.2-py3-none-any.whl", hash = "sha256:b722ac1d69a1d8f23f2fc5dd77c1c1aa105f9f10508fae3de680ad319a96200d"}, + {file = "DownloadKit-2.0.2.tar.gz", hash = "sha256:7b966e599684191e2c4f48c106a76f94ca2ba9c47a42c690f4636b67f0815983"}, +] + +[[package]] +name = "et-xmlfile" +version = "1.1.0" +requires_python = ">=3.6" +summary = "An implementation of lxml.xmlfile for the standard library" +groups = ["default"] +files = [ + 
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + +[[package]] +name = "filelock" +version = "3.15.4" +requires_python = ">=3.8" +summary = "A platform independent file lock." +groups = ["default"] +files = [ + {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, + {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, +] + +[[package]] +name = "func-timeout" +version = "4.3.5" +summary = "Python module which allows you to specify timeouts when calling any existing function. Also provides support for stoppable-threads" +groups = ["default"] +files = [ + {file = "func_timeout-4.3.5.tar.gz", hash = "sha256:74cd3c428ec94f4edfba81f9b2f14904846d5ffccc27c92433b8b5939b5575dd"}, +] + +[[package]] +name = "idna" +version = "3.7" +requires_python = ">=3.5" +summary = "Internationalized Domain Names in Applications (IDNA)" +groups = ["default"] +files = [ + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, +] + +[[package]] +name = "lxml" +version = "5.2.2" +requires_python = ">=3.6" +summary = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+groups = ["default"] +files = [ + {file = "lxml-5.2.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:45f9494613160d0405682f9eee781c7e6d1bf45f819654eb249f8f46a2c22545"}, + {file = "lxml-5.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0b3f2df149efb242cee2ffdeb6674b7f30d23c9a7af26595099afaf46ef4e88"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d28cb356f119a437cc58a13f8135ab8a4c8ece18159eb9194b0d269ec4e28083"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:657a972f46bbefdbba2d4f14413c0d079f9ae243bd68193cb5061b9732fa54c1"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b9ea10063efb77a965a8d5f4182806fbf59ed068b3c3fd6f30d2ac7bee734"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07542787f86112d46d07d4f3c4e7c760282011b354d012dc4141cc12a68cef5f"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:303f540ad2dddd35b92415b74b900c749ec2010e703ab3bfd6660979d01fd4ed"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2eb2227ce1ff998faf0cd7fe85bbf086aa41dfc5af3b1d80867ecfe75fb68df3"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:1d8a701774dfc42a2f0b8ccdfe7dbc140500d1049e0632a611985d943fcf12df"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:56793b7a1a091a7c286b5f4aa1fe4ae5d1446fe742d00cdf2ffb1077865db10d"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eb00b549b13bd6d884c863554566095bf6fa9c3cecb2e7b399c4bc7904cb33b5"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a2569a1f15ae6c8c64108a2cd2b4a858fc1e13d25846be0666fc144715e32ab"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:8cf85a6e40ff1f37fe0f25719aadf443686b1ac7652593dc53c7ef9b8492b115"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d237ba6664b8e60fd90b8549a149a74fcc675272e0e95539a00522e4ca688b04"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0b3f5016e00ae7630a4b83d0868fca1e3d494c78a75b1c7252606a3a1c5fc2ad"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:23441e2b5339bc54dc949e9e675fa35efe858108404ef9aa92f0456929ef6fe8"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2fb0ba3e8566548d6c8e7dd82a8229ff47bd8fb8c2da237607ac8e5a1b8312e5"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:79d1fb9252e7e2cfe4de6e9a6610c7cbb99b9708e2c3e29057f487de5a9eaefa"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6dcc3d17eac1df7859ae01202e9bb11ffa8c98949dcbeb1069c8b9a75917e01b"}, + {file = "lxml-5.2.2-cp311-cp311-win32.whl", hash = "sha256:4c30a2f83677876465f44c018830f608fa3c6a8a466eb223535035fbc16f3438"}, + {file = "lxml-5.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:49095a38eb333aaf44c06052fd2ec3b8f23e19747ca7ec6f6c954ffea6dbf7be"}, + {file = "lxml-5.2.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7429e7faa1a60cad26ae4227f4dd0459efde239e494c7312624ce228e04f6391"}, + {file = "lxml-5.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:50ccb5d355961c0f12f6cf24b7187dbabd5433f29e15147a67995474f27d1776"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc911208b18842a3a57266d8e51fc3cfaccee90a5351b92079beed912a7914c2"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33ce9e786753743159799fdf8e92a5da351158c4bfb6f2db0bf31e7892a1feb5"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ec87c44f619380878bd49ca109669c9f221d9ae6883a5bcb3616785fa8f94c97"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08ea0f606808354eb8f2dfaac095963cb25d9d28e27edcc375d7b30ab01abbf6"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75a9632f1d4f698b2e6e2e1ada40e71f369b15d69baddb8968dcc8e683839b18"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74da9f97daec6928567b48c90ea2c82a106b2d500f397eeb8941e47d30b1ca85"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:0969e92af09c5687d769731e3f39ed62427cc72176cebb54b7a9d52cc4fa3b73"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:9164361769b6ca7769079f4d426a41df6164879f7f3568be9086e15baca61466"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d26a618ae1766279f2660aca0081b2220aca6bd1aa06b2cf73f07383faf48927"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab67ed772c584b7ef2379797bf14b82df9aa5f7438c5b9a09624dd834c1c1aaf"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3d1e35572a56941b32c239774d7e9ad724074d37f90c7a7d499ab98761bd80cf"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:8268cbcd48c5375f46e000adb1390572c98879eb4f77910c6053d25cc3ac2c67"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e282aedd63c639c07c3857097fc0e236f984ceb4089a8b284da1c526491e3f3d"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfdc2bfe69e9adf0df4915949c22a25b39d175d599bf98e7ddf620a13678585"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4aefd911793b5d2d7a921233a54c90329bf3d4a6817dc465f12ffdfe4fc7b8fe"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:8b8df03a9e995b6211dafa63b32f9d405881518ff1ddd775db4e7b98fb545e1c"}, + 
{file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f11ae142f3a322d44513de1018b50f474f8f736bc3cd91d969f464b5bfef8836"}, + {file = "lxml-5.2.2-cp312-cp312-win32.whl", hash = "sha256:16a8326e51fcdffc886294c1e70b11ddccec836516a343f9ed0f82aac043c24a"}, + {file = "lxml-5.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:bbc4b80af581e18568ff07f6395c02114d05f4865c2812a1f02f2eaecf0bfd48"}, + {file = "lxml-5.2.2.tar.gz", hash = "sha256:bb2dc4898180bea79863d5487e5f9c7c34297414bad54bcd0f0852aee9cfdb87"}, +] + +[[package]] +name = "openpyxl" +version = "3.1.5" +requires_python = ">=3.8" +summary = "A Python library to read/write Excel 2010 xlsx/xlsm files" +groups = ["default"] +dependencies = [ + "et-xmlfile", +] +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[[package]] +name = "pefile" +version = "2023.2.7" +requires_python = ">=3.6.0" +summary = "Python PE parsing module" +groups = ["default"] +marker = "platform_system == \"Windows\"" +files = [ + {file = "pefile-2023.2.7-py3-none-any.whl", hash = "sha256:da185cd2af68c08a6cd4481f7325ed600a88f6a813bad9dea07ab3ef73d8d8d6"}, + {file = "pefile-2023.2.7.tar.gz", hash = "sha256:82e6114004b3d6911c77c3953e3838654b04511b8b66e8583db70c65998017dc"}, +] + +[[package]] +name = "prometheus-client" +version = "0.17.1" +requires_python = ">=3.6" +summary = "Python client for the Prometheus monitoring system." 
+groups = ["default"] +files = [ + {file = "prometheus_client-0.17.1-py3-none-any.whl", hash = "sha256:e537f37160f6807b8202a6fc4764cdd19bac5480ddd3e0d463c3002b34462101"}, + {file = "prometheus_client-0.17.1.tar.gz", hash = "sha256:21e674f39831ae3f8acde238afd9a27a37d0d2fb5a28ea094f0ce25d2cbf2091"}, +] + +[[package]] +name = "psutil" +version = "6.0.0" +requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +summary = "Cross-platform lib for process and system monitoring in Python." +groups = ["default"] +files = [ + {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"}, + {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"}, + {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"}, + {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"}, + {file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"}, + {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"}, + {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"}, + {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"}, +] + +[[package]] +name = "requests" +version = "2.32.0" +requires_python = ">=3.8" +summary = "Python HTTP for Humans." 
+groups = ["default"] +dependencies = [ + "certifi>=2017.4.17", + "charset-normalizer<4,>=2", + "idna<4,>=2.5", + "urllib3<3,>=1.21.1", +] +files = [ + {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, + {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, +] + +[[package]] +name = "requests-file" +version = "2.1.0" +summary = "File transport adapter for Requests" +groups = ["default"] +dependencies = [ + "requests>=1.0.0", +] +files = [ + {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, + {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, +] + +[[package]] +name = "soupsieve" +version = "2.5" +requires_python = ">=3.8" +summary = "A modern CSS selector implementation for Beautiful Soup." +groups = ["test"] +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + +[[package]] +name = "tldextract" +version = "5.1.2" +requires_python = ">=3.8" +summary = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." 
+groups = ["default"] +dependencies = [ + "filelock>=3.0.8", + "idna", + "requests-file>=1.4", + "requests>=2.1.0", +] +files = [ + {file = "tldextract-5.1.2-py3-none-any.whl", hash = "sha256:4dfc4c277b6b97fa053899fcdb892d2dc27295851ab5fac4e07797b6a21b2e46"}, + {file = "tldextract-5.1.2.tar.gz", hash = "sha256:c9e17f756f05afb5abac04fe8f766e7e70f9fe387adb1859f0f52408ee060200"}, +] + +[[package]] +name = "urllib3" +version = "2.2.2" +requires_python = ">=3.8" +summary = "HTTP library with thread-safe connection pooling, file post, and more." +groups = ["default"] +files = [ + {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, + {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, +] + +[[package]] +name = "waitress" +version = "2.1.2" +requires_python = ">=3.7.0" +summary = "Waitress WSGI server" +groups = ["default", "test"] +files = [ + {file = "waitress-2.1.2-py3-none-any.whl", hash = "sha256:7500c9625927c8ec60f54377d590f67b30c8e70ef4b8894214ac6e4cad233d2a"}, + {file = "waitress-2.1.2.tar.gz", hash = "sha256:780a4082c5fbc0fde6a2fcfe5e26e6efc1e8f425730863c04085769781f51eba"}, +] + +[[package]] +name = "webob" +version = "1.8.7" +requires_python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*" +summary = "WSGI request and response object" +groups = ["test"] +files = [ + {file = "WebOb-1.8.7-py2.py3-none-any.whl", hash = "sha256:73aae30359291c14fa3b956f8b5ca31960e420c28c1bec002547fb04928cf89b"}, + {file = "WebOb-1.8.7.tar.gz", hash = "sha256:b64ef5141be559cfade448f044fa45c2260351edcb6a8ef6b7e00c7dcef0c323"}, +] + +[[package]] +name = "websocket-client" +version = "1.8.0" +requires_python = ">=3.8" +summary = "WebSocket client for Python with low level API options" +groups = ["default"] +files = [ + {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, + {file = 
"websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, +] + +[[package]] +name = "websockets" +version = "11.0.3" +requires_python = ">=3.7" +summary = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +groups = ["default"] +files = [ + {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb"}, + {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288"}, + {file = "websockets-11.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d"}, + {file = "websockets-11.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3"}, + {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b"}, + {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6"}, + {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97"}, + {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf"}, + {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd"}, + {file = "websockets-11.0.3-cp311-cp311-win32.whl", hash = "sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c"}, + {file = "websockets-11.0.3-cp311-cp311-win_amd64.whl", 
hash = "sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8"}, + {file = "websockets-11.0.3-py3-none-any.whl", hash = "sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6"}, + {file = "websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"}, +] + +[[package]] +name = "webtest" +version = "3.0.0" +requires_python = ">=3.6, <4" +summary = "Helper to test WSGI applications" +groups = ["test"] +dependencies = [ + "WebOb>=1.2", + "beautifulsoup4", + "waitress>=0.8.5", +] +files = [ + {file = "WebTest-3.0.0-py3-none-any.whl", hash = "sha256:2a001a9efa40d2a7e5d9cd8d1527c75f41814eb6afce2c3d207402547b1e5ead"}, + {file = "WebTest-3.0.0.tar.gz", hash = "sha256:54bd969725838d9861a9fa27f8d971f79d275d94ae255f5c501f53bb6d9929eb"}, +] + +[[package]] +name = "xvfbwrapper" +version = "0.2.9" +summary = "run headless display inside X virtual framebuffer (Xvfb)" +groups = ["default"] +marker = "platform_system != \"Windows\"" +files = [ + {file = "xvfbwrapper-0.2.9.tar.gz", hash = "sha256:bcf4ae571941b40254faf7a73432dfc119ad21ce688f1fdec533067037ecfc24"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..a8ef9d60ae --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,66 @@ +[project] +name = "FlareSolverr" +dynamic = ["version"] +description = "Proxy server to bypass Cloudflare protection" +authors = [ + {name = "Diego Heras", email = "ngosang@hotmail.es"}, +] +maintainers = [ + {name = "Paul Pfeister", email = "code@pfeister.dev"}, +] +dependencies = [ + "bottle==0.12.25", + "waitress==2.1.2", + "func-timeout==4.3.5", + "prometheus-client==0.17.1", + "requests==2.32.0", # required by undetected_chromedriver, DrissionPage + "certifi==2024.07.04", # required by undetected_chromedriver + "websockets==11.0.3", # required by undetected_chromedriver + "xvfbwrapper==0.2.9; platform_system != \"Windows\"", + "pefile==2023.2.7; platform_system == \"Windows\"", + "lxml", # 
required by DrissionPage + "cssselect", # required by DrissionPage + "DownloadKit>=2.0.0", # required by DrissionPage + "websocket-client", # required by DrissionPage + "click", # required by DrissionPage + "tldextract", # required by DrissionPage + "psutil", # required by DrissionPage + "openpyxl>=3.1.5", # required by DrissionPage (required by DataRecorder) +] +requires-python = ">=3.11" +readme = "README.md" +license = {text = "MIT"} +keywords = ["proxy", "cloudflare", "ddos-guard", "captcha", "security"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +test = ["WebTest==3.0.0"] +[tool.pdm.dev-dependencies] +test = [ + "WebTest==3.0.0", +] + +[project.urls] +homepage = "https://github.com/FlareSolverr/FlareSolverr/" +bug-tracker = "https://github.com/FlareSolverr/FlareSolverr/issues/" +documentation = "https://github.com/FlareSolverr/FlareSolverr/wiki/" + +[build-system] +requires = ["pdm-backend"] +build-backend = "pdm.backend" + +[tool.pdm] +distribution = true + +[tool.pdm.version] +source = "scm" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 204c344793..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -bottle==0.12.25 -waitress==2.1.2 -DrissionPage==4.1.0.0b14 -func-timeout==4.3.5 -prometheus-client==0.17.1 -# required by undetected_chromedriver -requests==2.32.0 -certifi==2024.07.04 -websockets==11.0.3 -# only required for linux and macos -xvfbwrapper==0.2.9; platform_system != "Windows" -# only required for windows -pefile==2023.2.7; platform_system == "Windows" diff --git a/src/flaresolverr/DataRecorder/__init__.py 
class OriginalRecorder(object):
    """Base class of the recorders: caches records and flushes them to a file."""
    _SUPPORTS = ('any',)

    def __init__(self, path=None, cache_size=None):
        """
        :param path: path of the file to save to
        :param cache_size: number of cached records that triggers a write;
                           0 means never write automatically (default 1000)
        """
        self._data = None
        self._style_data = None
        self._path = None
        self._type = None
        self._lock = Lock()
        self._pause_add = False  # True while a flush is running; input is paused
        self._pause_write = False  # True while a thread is writing the file
        self.show_msg = True
        self._setter = None
        self._data_count = 0  # number of records currently cached
        self._file_exists = False

        if path:
            self.set.path(path)
        self._cache = cache_size if cache_size is not None else 1000

    def __del__(self):
        """Write whatever is still cached when the object is destroyed."""
        self.record()

    @property
    def set(self):
        """Return the helper object used to configure this recorder."""
        if self._setter is None:
            self._setter = OriginalSetter(self)
        return self._setter

    @property
    def cache_size(self):
        """Return the cache size."""
        return self._cache

    @property
    def path(self):
        """Return the path of the target file."""
        return self._path

    @property
    def type(self):
        """Return the type of the target file."""
        return self._type

    @property
    def data(self):
        """Return the data currently held in the cache."""
        return self._data

    def record(self, new_path=None):
        """Flush the cached data to the file, optionally as a one-off "save as".

        The actual writing is done by ``_record()``; this method adds the
        locking, the retry on ``PermissionError`` and the "save as" handling.

        :param new_path: if given, the existing file is copied there and the
                         cached data is written to that copy instead
        :return: path of the file that was (or would have been) written
        :raises ValueError: if there is data to write but no path is set
        """
        original_path = return_path = self._path
        if new_path:
            new_path = str(get_usable_path(new_path))
            return_path = self._path = new_path

            # A recorder that never had a path has nothing to copy;
            # Path(None) would raise TypeError.
            if original_path and Path(original_path).exists():
                from shutil import copy
                copy(original_path, self._path)

        try:
            if not self._data and not self._style_data:
                return return_path

            if not self._path:
                raise ValueError('保存路径为空。')

            with self._lock:
                self._pause_add = True  # stop accepting data while writing
                try:
                    if self.show_msg:
                        print(f'{self.path} 开始写入文件,切勿关闭进程。')

                    Path(self.path).parent.mkdir(parents=True, exist_ok=True)
                    while True:
                        try:
                            while self._pause_write:  # wait for another writer
                                sleep(.1)

                            self._pause_write = True
                            self._record()
                            break

                        except PermissionError:
                            # Typically the file is open elsewhere: keep retrying.
                            if self.show_msg:
                                print('\r文件被打开,保存失败,请关闭,程序会自动重试。', end='')

                        except Exception:
                            # Best effort: dump the unsaved data, then re-raise
                            # the original error whether or not the dump worked.
                            try:
                                with open('failed_data.txt', 'a+', encoding='utf-8') as f:
                                    f.write(str(self.data) + '\n')
                                print('保存失败的数据已保存到failed_data.txt。')
                            except Exception:
                                pass
                            raise

                        finally:
                            self._pause_write = False

                        sleep(.3)

                    if new_path:
                        self._path = original_path

                    if self.show_msg:
                        print(f'{self.path} 写入文件结束。')
                    self.clear()
                    self._data_count = 0
                finally:
                    # Always release producers, even when the write failed;
                    # otherwise every later add_data() call would wait forever.
                    self._pause_add = False
        finally:
            # "Save as" is temporary: restore the configured path on every exit
            # route (nothing to write, success, or error).
            if new_path:
                self._path = original_path

        return return_path

    def clear(self):
        """Drop the data held in the cache."""
        if self._data:
            self._data.clear()

    @abstractmethod
    def add_data(self, data):
        pass

    @abstractmethod
    def _record(self):
        pass


class BaseRecorder(OriginalRecorder):
    """Parent class of Recorder, Filler and DBRecorder."""
    _SUPPORTS = ('xlsx', 'csv')

    def __init__(self, path=None, cache_size=None):
        """
        :param path: path of the file to save to
        :param cache_size: number of cached records that triggers a write;
                           0 means never write automatically
        """
        super().__init__(path, cache_size)
        self._before = []
        self._after = []
        self._encoding = 'utf-8'
        self._table = None

    @property
    def set(self):
        """Return the helper object used to configure this recorder."""
        if self._setter is None:
            self._setter = BaseSetter(self)
        return self._setter

    @property
    def before(self):
        """Return the current ``before`` content."""
        return self._before

    @property
    def after(self):
        """Return the current ``after`` content."""
        return self._after

    @property
    def table(self):
        """Return the default table name."""
        return self._table

    @property
    def encoding(self):
        """Return the text encoding."""
        return self._encoding

    @abstractmethod
    def add_data(self, data, table=None):
        pass

    @abstractmethod
    def _record(self):
        pass
class ByteRecorder(OriginalRecorder):
    """Recorder that caches chunks of bytes and writes them into a binary file."""
    _SUPPORTS = ('any',)
    __END = (0, 2)  # (offset, whence) pair meaning "end of file"

    def __init__(self, path=None, cache_size=None):
        """Create a recorder for binary data.

        :param path: path of the file to save to
        :param cache_size: number of cached chunks that triggers a write;
                           0 means never write automatically
        """
        super().__init__(path, cache_size)

    def add_data(self, data, seek=None):
        """Queue one chunk of binary data.

        :param data: the chunk, must be ``bytes``
        :param seek: position in the file to write at; None appends at the end
        :return: None
        :raises TypeError: if ``data`` is not ``bytes``
        :raises ValueError: if ``seek`` is neither None nor an int >= 0
        """
        while self._pause_add:  # a flush is in progress, wait for it to finish
            sleep(.1)

        if not isinstance(data, bytes):
            raise TypeError('只能接受bytes类型数据。')
        seek_is_valid = seek is None or (isinstance(seek, int) and seek >= 0)
        if not seek_is_valid:
            raise ValueError('seek参数只能接受None或大于等于0的整数。')

        self._data.append((data, seek))
        self._data_count += 1

        threshold = self.cache_size
        if threshold > 0 and self._data_count >= threshold:
            self.record()

    def _record(self):
        """Write every cached chunk to the file at its requested position."""
        target = self.path
        if not (self._file_exists or Path(target).exists()):
            # 'rb+' below needs an existing file, so create an empty one first.
            with open(target, 'w'):
                pass
        self._file_exists = True

        end = ByteRecorder.__END
        with open(target, 'rb+') as stream:
            last = None
            for chunk, position in self._data:
                where = end if position is None else (position, 0)
                # Consecutive appends need no new seek: the cursor is already
                # at the end of the file after the previous write.
                still_appending = last == end and where == end
                if not still_appending:
                    stream.seek(*where)
                    last = where
                stream.write(chunk)
class DBRecorder(BaseRecorder):
    """Recorder that stores the cached rows in a SQLite database file."""
    _SUPPORTS = ('db',)

    def __init__(self, path=None, cache_size=None, table=None):
        """Create a recorder that writes to SQLite.

        :param path: path of the database file
        :param cache_size: number of cached rows that triggers a write;
                           0 means never write automatically
        :param table: default table name
        """
        self._conn = None
        self._cur = None
        super().__init__(None, cache_size)
        # The base initialiser resets ``_type`` to None, so set it afterwards.
        self._type = 'db'
        if path:
            self.set.path(path, table)

    @property
    def set(self):
        """Return the helper object used to configure this recorder."""
        if self._setter is None:
            self._setter = DBSetter(self)
        return self._setter

    def add_data(self, data, table=None):
        """Queue data for insertion.

        :param data: one row or a sequence of rows; a dict fills the columns
                     named by its keys, other formats fill columns left to right
        :param table: name of the table to insert into; defaults to ``self.table``
        :return: None
        :raises RuntimeError: if no table name is available
        """
        while self._pause_add:  # a flush is in progress, wait for it to finish
            sleep(.1)

        table = table or self.table
        if not isinstance(table, str):
            raise RuntimeError('未指定数据库表名。')

        if not isinstance(data, (list, tuple, dict)):
            data = (data,)

        if not data:
            self._data.setdefault(table, []).append(tuple())
            self._data_count += 1

        # a single row
        elif isinstance(data, dict) or not isinstance(data[0], (list, tuple, dict)):
            self._data.setdefault(table, []).append(data_to_list_or_dict(self, data))
            self._data_count += 1

        else:  # several rows
            if self.after or self.before:
                dd = [data_to_list_or_dict(self, d) for d in data]
            else:
                dd = [data_to_list_or_dict_simplify(d) for d in data]
            self._data.setdefault(table, []).extend(dd)
            self._data_count += len(data)

        if 0 < self.cache_size <= self._data_count:
            self.record()

    def run_sql(self, sql, single=True, commit=False):
        """Execute an SQL statement and return its result.

        :param sql: the SQL statement
        :param single: fetch only one row instead of all rows
        :param commit: commit the transaction after executing
        :return: the fetched row(s); None when ``single`` and nothing matched
        """
        self._connect()
        try:
            self._cur.execute(sql)
            r = self._cur.fetchone() if single else self._cur.fetchall()
            if commit:
                self._conn.commit()
        finally:
            # Close even when the statement fails, so the handle is not leaked.
            self._close_connection()
        return r

    def _connect(self):
        """Open the database, creating its parent directory if needed."""
        Path(self.path).parent.mkdir(parents=True, exist_ok=True)
        self._conn = connect(self.path)
        self._cur = self._conn.cursor()

    def _close_connection(self):
        """Close cursor and connection; failures while closing are ignored."""
        if self._conn is None:
            return
        for resource in (self._cur, self._conn):
            try:
                resource.close()
            except Exception:
                pass  # closing is best effort

    def _to_database(self, data_list, table, tables):
        """Insert a batch of rows into one table.

        :param data_list: rows to insert; all dicts (same key set) or all sequences
        :param table: name of the target table
        :param tables: mapping of table name to its list of column names
        :return: None
        """
        # NOTE(review): table and column names are interpolated into the SQL
        # text unquoted; they must be trusted, plain identifiers.
        if isinstance(data_list[0], dict):
            keys = list(data_list[0])
            for key in keys:  # add columns the table does not have yet
                if key not in tables[table]:
                    sql = f'ALTER TABLE {table} ADD COLUMN {key}'
                    self._cur.execute(sql)
                    tables[table].append(key)

            question_masks = ','.join('?' * len(keys))
            keys_txt = ','.join(keys)
            # Rows are batched by key *set*, so their key order may differ:
            # read every row in the column order used by the statement.
            values = [ok_list([row[key] for key in keys]) for row in data_list]
            sql = f'INSERT INTO {table} ({keys_txt}) values ({question_masks})'

        else:
            question_masks = ','.join('?' * len(tables[table]))
            values = data_list
            sql = f'INSERT INTO {table} values ({question_masks})'

        self._cur.executemany(sql, values)

    def _record(self):
        """Write the cached rows to SQLite.

        :raises TypeError: if the first row for a not-yet-existing table is not a dict
        :raises RuntimeError: if a sequence row has more items than the table has columns
        """
        self._connect()
        try:
            # collect the existing tables and their column names
            self._cur.execute("select name from sqlite_master where type='table'")
            tables = {}
            for table in self._cur.fetchall():
                self._cur.execute(f"PRAGMA table_info({table[0]})")
                tables[table[0]] = [i[1] for i in self._cur.fetchall()]

            for table, data in self._data.items():
                if not data:
                    continue

                data_list = []
                if isinstance(data[0], dict):
                    curr_keys = data[0].keys()
                else:
                    curr_keys = len(data[0])

                for d in data:
                    if isinstance(d, dict):
                        tmp_keys = d.keys()
                        if table not in tables:
                            keys = d.keys()
                            self._cur.execute(f"CREATE TABLE {table} ({','.join(keys)})")
                            # Must be a list: _to_database appends new columns.
                            tables[table] = list(keys)

                    else:
                        if table not in tables:
                            raise TypeError('新建表格首次须接收数据需为dict格式。')
                        tmp_keys = len(d)
                        long = len(tables[table])
                        if long > tmp_keys:  # pad short rows with NULLs
                            d = ok_list(d)
                            d.extend([None] * (long - tmp_keys))
                        elif long < tmp_keys:
                            raise RuntimeError('数据个数大于列数(注意before和after属性)。')

                    # A row of a different shape starts a new batch.
                    if tmp_keys != curr_keys:
                        self._to_database(data_list, table, tables)
                        curr_keys = tmp_keys
                        data_list = []

                    data_list.append(d)

                if data_list:
                    self._to_database(data_list, table, tables)

            self._conn.commit()
        finally:
            self._close_connection()
+ data: dict = ... + + def __init__(self, + path: Union[str, Path] = None, + cache_size: int = None, + table: str = None): ... + + @property + def set(self) -> DBSetter: ... + + def add_data(self, data: Any, table: str = None) -> None: ... + + def run_sql(self, sql: str, single: bool = True, commit: bool = False) -> Union[None, list, tuple]: ... + + def _connect(self) -> None: ... + + def _close_connection(self) -> None: ... + + def _record(self) -> None: ... + + def _to_database(self, data_list: list, table: str, tables: dict) -> None: ... diff --git a/src/flaresolverr/DataRecorder/filler.py b/src/flaresolverr/DataRecorder/filler.py new file mode 100644 index 0000000000..64a85f9dde --- /dev/null +++ b/src/flaresolverr/DataRecorder/filler.py @@ -0,0 +1,576 @@ +# -*- coding:utf-8 -*- +from csv import reader as csv_reader, writer as csv_writer +from pathlib import Path +from time import sleep + +from openpyxl import load_workbook +from openpyxl.utils import get_column_letter + +from .base import BaseRecorder +from .setter import FillerSetter +from .style.cell_style import CellStyle, NoneStyle +from .tools import (parse_coord, get_usable_coord, process_content, data_to_list_or_dict, ok_list, get_usable_coord_int, + data_to_list_or_dict_simplify, get_csv_head, get_wb, get_ws, get_xlsx_head, create_csv, FillerDict, + FillerList) + + +class Filler(BaseRecorder): + def __init__(self, path=None, cache_size=None, key_cols=True, begin_row=2, + sign_col=True, data_col=None, sign=None, deny_sign=False): + """用于处理表格文件的工具 + :param path: 保存的文件路径 + :param cache_size: 每接收多少条记录写入文件,传入0表示不自动保存 + :param key_cols: 作为关键字的列,可以是多列,可以是列编号或序号,从1开始,True表示获取整行 + :param begin_row: 数据开始的行,默认表头一行 + :param sign_col: 用于判断是否已填数据的列,可以是列编号或序号,从1开始,True表示获取所有行,不进行判断 + :param data_col: 要填入数据的第一列,从1开始 + :param sign: 按这个值筛选需要的行纳入keys + :param deny_sign: 是否反向匹配sign,即筛选sign_col列值不是sign的行 + """ + super().__init__(None, cache_size) + self._delimiter = ',' # csv文件分隔符 + self._quote_char = '"' # csv文件引用符 + 
    def add_data(self, data, coord='newline', table=None):
        """Queue data to be written, one row per call unless ``data`` is two-dimensional.

        ``coord`` may be a row number, a column, or a full coordinate:
        a bare number (row), e.g. 3, uses ``self.data_col`` as the column;
        a column, or a coordinate without a row, appends a new row starting at
        that column, e.g. 'c', ',3', (None, 3), 'None,3';
        'newline' appends a new row starting at ``self.data_col``;
        a full coordinate writes at that cell, e.g. 'a3', '3,1', (3,1), [3,1];
        row and column numbers may be negative to count from the end, -1 being
        the last row, e.g. 'a-3', (-3, -3).

        :param data: the content to add, any format
        :param coord: where to put the data: row number, column or coordinate,
                      e.g. 'a3', 7, (3, 1), [3, 1], 'c'
        :param table: target sheet, xlsx only. None uses the value set through
                      set.table(); a bool selects the active sheet
        :return: None
        """
        while self._pause_add:  # wait until a running write has finished
            sleep(.1)

        if not isinstance(data, (list, tuple)):
            data = (data,)

        to = self._data
        # Style operations are queued separately from cell data.
        # NOTE(review): ``_style_data`` starts as a dict, so for non-xlsx files
        # the ``to.append`` below looks like it would fail for these
        # operations - confirm the intended behaviour for csv.
        if coord in ('cover_style', 'replace_style', 'set_width', 'set_height'):
            to = self._style_data

        elif coord not in ('set_link', 'set_img'):
            coord = parse_coord(coord, self.data_col)
            if not data:
                data = ([],)
                self._data_count += 1
            # one-dimensional data: a single row
            elif isinstance(data, dict) or (
                    isinstance(data, (list, tuple)) and not isinstance(data[0], (list, tuple, dict))):
                data = (data_to_list_or_dict(self, data),)
                self._data_count += 1
            else:  # two-dimensional data: several rows
                if self.after or self.before:
                    data = [data_to_list_or_dict(self, d) for d in data]
                else:
                    data = [data_to_list_or_dict_simplify(d) for d in data]
                self._data_count += len(data)

        else:
            # set_link / set_img: stored untouched, counted as one record
            self._data_count += 1

        if self._type != 'xlsx':
            to.append((coord, data))

        else:
            # xlsx entries are grouped per sheet; the key None is used for the
            # sheet selected by passing a bool
            if table is None:
                table = self._table
            elif isinstance(table, bool):
                table = None
            to.setdefault(table, []).append((coord, data))

        # flush once the cache threshold is reached (0 disables this)
        if 0 < self.cache_size <= self._data_count:
            self.record()
None + """ + s = 'replace_style' if replace else 'cover_style' + self.add_data((coord, style), s, table) + + def set_img(self, coord, img_path, width=None, height=None, table=None): + """ + :param coord: 单元格坐标 + :param img_path: 图片路径 + :param width: 图片宽 + :param height: 图片高 + :param table: 数据表名,仅支持xlsx格式。为None表示用set.table()方法设置的值,为bool表示活动的表格 + :return: None + """ + if isinstance(img_path, Path): + img_path = str(img_path) + self.add_data((coord, img_path, width, height), 'set_img', table) + + def set_row_height(self, row, height, table=None): + """设置行高,可设置连续多行 + :param row: 行号,可传入范围,如'1:4' + :param height: 行高 + :param table: 数据表名,仅支持xlsx格式。为None表示用set.table()方法设置的值,为bool表示活动的表格 + :return: None + """ + self.add_data((row, height), 'set_height', table) + + def set_col_width(self, col, width, table=None): + """设置列宽,可设置连续多列 + :param col: 列号,数字或字母,可传入范围,如'1:4'、'a:d' + :param width: 列宽 + :param table: 数据表名,仅支持xlsx格式。为None表示用set.table()方法设置的值,为bool表示活动的表格 + :return: None + """ + self.add_data((col, width), 'set_width', table) + + def _record(self): + """记录数据""" + if self.type == 'xlsx': + self._to_xlsx() + elif self.type == 'csv': + self._to_csv() + self._style_data = {} + + def _to_xlsx(self): + """填写数据到xlsx文件""" + wb, new_file = get_wb(self) + tables = [i.title for i in wb.worksheets] + + for table in {}.fromkeys(list(self._data.keys()) + list(self._style_data.keys())): + ws, new_sheet = get_ws(wb, table, tables, new_file) + begin = get_xlsx_head(self, new_file, new_sheet, self._data[table][0], ws, True)[0] + empty = not any([i.value for i in ws[1]]) and ws.max_row == 1 + head = self._head.get(ws.title, None) if self._fit_head else None + + if new_file: # 尝试解决openpyxl的bug + if ws.cell(1, 1).value is None: + ws.cell(1, 1).value = '' + wb.save(self.path) + wb.close() + wb = load_workbook(self.path) + ws = wb[table] if table else wb.active + new_file = False + + if self._data: # 处理表头 + for table_data in self._data[table][begin:]: + set_data_to_ws(ws, table_data, empty, 
head, self) + empty = False + + if self._style_data: + for table_data in self._style_data[table]: + set_style_to_ws(ws, table_data, self) + + wb.save(self.path) + wb.close() + + def _to_csv(self): + """填写数据到xlsx文件""" + if self._head is not None and not self._file_exists: + create_csv(self) + elif self._head is None: + get_csv_head(self, True) + + with open(self.path, 'r', encoding=self.encoding) as f: + reader = csv_reader(f, delimiter=self.delimiter, quotechar=self.quote_char) + lines = list(reader) + lines_count = len(lines) + + head_len = len(self._head) if self._fit_head else None + for i in self._data: + if i[0] == 'set_link': + coord = parse_coord(i[1][0], self.data_col) + now_data = (f'=HYPERLINK("{i[1][1]}","{i[1][2] or i[1][1]}")',) + + elif i[0] == 'set_img': + continue + + else: + coord = i[0] + now_data = i[1] + + row, col = get_usable_coord_int(coord, lines_count, len(lines[0]) if lines_count else 1) + now_data = (now_data,) if not isinstance(now_data[0], (list, tuple, dict)) else now_data + + for r, data in enumerate(now_data, row): + for _ in range(r - lines_count): # 若行数不够,填充行数 + lines.append([]) + lines_count += 1 + row_num = r - 1 + + if isinstance(data, dict): + if self._fit_head and self._head: + # 若列数不够,填充空列 + lines[row_num].extend([None] * (head_len - len(lines[row_num]))) + for k, h in enumerate(self._head): + if h in data: + lines[row_num][k] = data[h] + lines[row_num] = ok_list(lines[row_num]) + continue + + else: + data = ok_list(data.values()) + + # 若列数不够,填充空列 + lines[row_num].extend([None] * (col - len(lines[row_num]) + len(data) - 1)) + for k, j in enumerate(data): # 填充数据 + lines[row_num][col + k - 1] = process_content(j) + + writer = csv_writer(open(self.path, 'w', encoding=self.encoding, newline=''), delimiter=self.delimiter, + quotechar=self.quote_char) + writer.writerows(lines) + + +def get_xlsx_keys(filler, as_dict): + """返回key列内容,第一位为行号,其余为key列的值 + 如果as_dict为True,返回dict格式,value为第一行值,值为空或begin_row为1时用列号,'row' 值为行号 + eg.[3, '名称', 
'id'] + :param filler: 记录器对象 + :param as_dict: 是否以dict格式返回数据 + :return: 关键字组成的列表或字典 + """ + wb = load_workbook(filler.path, data_only=True, read_only=True) + if filler.table and filler.table not in [i.title for i in wb.worksheets]: + raise RuntimeError(f'xlsx文件未包含此工作表:{filler.table}') + ws = wb[filler.table] if filler.table else wb.active + + if ws.max_column is None: # 遇到过read_only时无法获取列数的文件 + wb.close() + wb = load_workbook(filler.path, data_only=True) + ws = wb[filler.table] if filler.table else wb.active + + rows = ws.rows + loop = 0 + if as_dict: + title = [] + if filler.begin_row == 1: + t = [get_column_letter(x) for x in range(1, ws.max_column + 1) + if filler.key_cols is True or x in filler.key_cols] + else: + try: + t = next(rows) + t = [x.value if x.value else get_column_letter(k) for k, x in enumerate(t, 1) + if filler.key_cols is True or k in filler.key_cols] + loop = 1 + + except StopIteration: + return [] + + if len(t) != len(set(t)): + raise RuntimeError('表头出现内容重复。') + title.extend(t) + method = _make_dict_data + + else: + method = _make_list_data + title = None + + try: + for _ in range(filler.begin_row - loop - 1): + next(rows) + except StopIteration: + return [] + + # --------------------------------------------------------- + + if filler.sign_col is True or filler.sign_col > ws.max_column: # 获取所有行 + if filler.key_cols is True: # 获取整行 + res = [method(ind, [i.value for i in row], title) + for ind, row in enumerate(rows, filler.begin_row)] + else: # 只获取对应的列 + res = [method(ind, [row[i - 1].value for i in filler.key_cols], title) + for ind, row in enumerate(rows, filler.begin_row)] + + else: # 获取符合条件的行 + if filler.key_cols is True: # 获取整行 + if filler.deny_sign: + res = [method(ind, [i.value for i in row], title) + for ind, row in enumerate(rows, filler.begin_row) + if row[filler.sign_col - 1].value != filler.sign] + else: + res = [method(ind, [i.value for i in row], title) + for ind, row in enumerate(rows, filler.begin_row) + if row[filler.sign_col - 
def get_csv_keys(filler, as_dict):
    """Return the key rows of the csv file behind ``filler``.

    Rows are taken from ``filler.begin_row`` on. When ``filler.sign_col`` is
    not True, only rows whose value in that column equals ``filler.sign``
    (or differs from it, when ``filler.deny_sign``) are returned; a missing
    cell counts as ''.

    With ``as_dict`` every row is built by ``_make_dict_data``, keyed by the
    first row of the file; empty header cells, or ``begin_row == 1``, fall back
    to column letters. Without ``as_dict`` a row is the tuple
    ``(row_number, line)`` when ``filler.key_cols`` is True, otherwise the
    result of ``_make_list_data`` for the selected columns.

    :param filler: the recorder object
    :param as_dict: whether to return the rows as dicts
    :return: list of rows
    :raises RuntimeError: if the header used for dict keys has duplicates
    """
    first_index = filler.begin_row - 1  # 0-based index of the first data row
    sign = '' if filler.sign is None else str(filler.sign)
    key_cols = filler.key_cols

    with open(filler.path, 'r', encoding=filler.encoding, newline='') as f:
        lines = list(csv_reader(f, delimiter=filler.delimiter, quotechar=filler.quote_char))
    if not lines:
        return []

    title = None
    if as_dict:
        if filler.begin_row == 1:
            title = [get_column_letter(x) for x in range(1, len(lines[0]) + 1)
                     if key_cols is True or x in key_cols]
        else:
            title = [x if x else get_column_letter(k) for k, x in enumerate(lines[0], 1)
                     if key_cols is True or k in key_cols]

        if len(title) != len(set(title)):
            raise RuntimeError('表头内容重复。')

    def is_wanted(line):
        """Apply the sign filter to one row."""
        if filler.sign_col is True:  # no filtering, take every row
            return True
        col = filler.sign_col - 1
        row_sign = '' if col > len(line) - 1 else line[col]
        return row_sign != sign if filler.deny_sign else row_sign == sign

    def pick(line):
        """Return the key columns of one row; cells missing in short rows are ''."""
        if key_cols is True:
            return line
        return [line[i - 1] if i <= len(line) else '' for i in key_cols]

    res = []
    for ind, line in enumerate(lines[first_index:], first_index + 1):
        if not is_wanted(line):
            continue
        if as_dict:
            # Always pass the header; omitting it raised TypeError before.
            res.append(_make_dict_data(ind, pick(line), title))
        elif key_cols is True:
            # NOTE(review): kept as a plain tuple for backward compatibility,
            # although the xlsx counterpart wraps whole rows as well.
            res.append((ind, line))
        else:
            res.append(_make_list_data(ind, pick(line)))

    return res


def set_data_to_ws(ws, data, empty, head, filler):
    """Apply one cached entry (data rows, a link or an image) to a worksheet.

    :param ws: the worksheet to write to
    :param data: ``(coord, payload)`` entry; ``coord`` may also be the marker
                 'set_link' or 'set_img'
    :param empty: whether the sheet is still empty (rows then start from 0)
    :param head: header row used to place dict values by key, or a falsy value
    :param filler: the recorder object
    :return: None
    """
    max_row = 0 if empty else ws.max_row
    if data[0] == 'set_link':
        target, link, content = data[1][0], data[1][1], data[1][2]
        coord = parse_coord(target, filler.data_col)
        row, col = get_usable_coord(coord, max_row, ws)
        cell = ws.cell(row, col)
        # Was ``True if cell.hyperlink else Filler``: the class object is
        # truthy, so the flag could never be false.
        has_link = bool(cell.hyperlink)
        cell.hyperlink = None if link is None else process_content(link, True)
        if content is not None:
            cell.value = process_content(content, True)
        if link:
            if filler._link_style:
                filler._link_style.to_cell(cell, replace=False)
        elif has_link:
            # the link was removed: reset the style it had been given
            NoneStyle().to_cell(cell, replace=False)
        return

    elif data[0] == 'set_img':
        coord, img_path, width, height = data[1]
        coord = parse_coord(coord, filler.data_col)
        row, col = get_usable_coord(coord, max_row, ws)

        from openpyxl.drawing.image import Image
        img = Image(img_path)
        if width and height:
            img.width = width
            img.height = height
        elif width:  # only one side given: keep the aspect ratio
            img.height = int(img.height * (width / img.width))
            img.width = width
        elif height:
            img.width = int(img.width * (height / img.height))
            img.height = height
        col = get_column_letter(col)
        ws.add_image(img, f'{col}{row}')
        return

    row, col = get_usable_coord(data[0], max_row, ws)
    # normalise a single row to a sequence of rows
    now_data = (data[1],) if not isinstance(data[1][0], (list, tuple, dict)) else data[1]

    for r, item in enumerate(now_data, row):
        if isinstance(item, dict):
            if head:  # place the values under the matching header cells
                for k, h in enumerate(head, 1):
                    if h in item:
                        ws.cell(r, k).value = process_content(item[h], True)
                continue
            item = item.values()

        for offset, value in enumerate(item):
            ws.cell(r, col + offset).value = process_content(value, True)


def set_style_to_ws(ws, data, filler):
    """Apply one cached style entry (cell style, column width, row height).

    :param ws: the worksheet to modify
    :param data: ``(operation, payload)`` with operation 'replace_style',
                 'cover_style', 'set_width' or 'set_height'
    :param filler: the recorder object
    :return: None
    """
    if data[0] in ('replace_style', 'cover_style'):
        mode = data[0] == 'replace_style'
        coord = data[1][0]
        style = NoneStyle() if data[1][1] is None else data[1][1]
        if isinstance(coord, int) or (isinstance(coord, str) and coord.isdigit()):
            for c in ws[coord]:  # a whole row
                style.to_cell(c, replace=mode)
            return

        elif isinstance(coord, str):
            if ':' in coord:  # a range: iterate the cells of every row/column
                for c in ws[coord]:
                    for cc in c:
                        style.to_cell(cc, replace=mode)
                return
            elif coord.isalpha():  # a whole column
                for c in ws[coord]:
                    style.to_cell(c, replace=mode)
                return

        coord = parse_coord(coord, filler.data_col)
        row, col = get_usable_coord(coord, ws.max_row, ws)
        style.to_cell(ws.cell(row, col), replace=mode)

    elif data[0] == 'set_width':
        from openpyxl.utils import column_index_from_string

        col, width = data[1]
        parts = [col] if isinstance(col, int) else col.split(':')
        indexes = [p if isinstance(p, int) else int(p) if p.isdigit() else column_index_from_string(p)
                   for p in parts]
        # Cover the whole range; previously 'a:d' only changed columns a and d.
        for index in range(min(indexes), max(indexes) + 1):
            ws.column_dimensions[get_column_letter(index)].width = width

    elif data[0] == 'set_height':
        row, height = data[1]
        if isinstance(row, int):
            ws.row_dimensions[row].height = height
        elif isinstance(row, str):
            bounds = [int(r) for r in row.split(':')]
            # Cover the whole range; previously '1:4' only changed rows 1 and 4.
            for r in range(min(bounds), max(bounds) + 1):
                ws.row_dimensions[r].height = height


def _make_list_data(ind, data, title=None):
    """Wrap ``data`` in a FillerList whose ``row`` attribute is ``ind``."""
    r = FillerList(data)
    r.row = ind
    return r


def _make_dict_data(ind, data, title):
    """Pair ``title`` with ``data`` in a FillerDict whose ``row`` attribute is ``ind``."""
    r = FillerDict(zip(title, data))
    r.row = ind
    return r
class Filler(BaseRecorder):
    # Type stub for the Filler recorder; attribute and property types only.
    _set: FillerSetter = ...
    _key_cols: Union[List[int], bool, None] = ...
    _begin_row: Union[None, str, int] = ...
    _sign_col: Union[None, int, bool] = ...
    _data_col: Optional[int] = ...
    _sign: Any = ...
    _deny_sign: bool = ...
    # no link style is applied while this is unset
    _link_style: Optional[CellStyle] = ...
    _quote_char: str = ...
    _delimiter: str = ...
    _data: Union[list, dict] = ...
    _head: Union[None, list, dict] = ...
    _fit_head: bool = ...
    data: Union[list, dict] = ...
    _style_data: Union[list, dict] = ...

    def __init__(self, path: Union[None, str, Path] = None,
                 cache_size: int = None,
                 key_cols: Union[str, int, list, tuple, bool] = True,
                 begin_row: int = 2,
                 sign_col: Union[str, int, bool] = True,
                 data_col: Union[None, int, str] = None,
                 sign: Any = None,
                 deny_sign: bool = False) -> None: ...

    # the sign may be any value (including None), matching ``_sign``
    @property
    def sign(self) -> Any: ...

    @property
    def deny_sign(self) -> bool: ...

    @property
    def key_cols(self) -> Union[List[int], bool]: ...

    @property
    def sign_col(self) -> Union[None, int, bool]: ...

    @property
    def data_col(self) -> int: ...

    @property
    def begin_row(self) -> Union[str, int]: ...

    @property
    def keys(self) -> List[FillerList]: ...

    @property
    def dict_keys(self) -> List[FillerDict]: ...

    @property
    def set(self) -> FillerSetter: ...

    @property
    def delimiter(self) -> str: ...

    @property
    def quote_char(self) -> str: ...

    def add_data(self, data: Any,
                 coord: Union[list, Tuple[Union[None, int, str], Union[int, str]], str, int] = 'newline',
                 table: Union[str, bool] = None) -> None: ...

    def set_link(self,
                 coord: Union[int, str, tuple, list],
                 link: Optional[str],
                 content: Union[None, int, str, float] = None,
                 table: Union[str, bool] = None) -> None: ...

    def set_style(self, coord: Union[int, str, tuple, list], style: Optional[CellStyle],
                  replace: bool = True,
                  table: Union[str, bool] = None) -> None: ...

    def set_img(self, coord: Union[int, str, tuple, list], img_path: Union[None, str, Path],
                width: Optional[float] = None,
                height: Optional[float] = None,
                table: Union[str, bool] = None) -> None: ...

    def set_row_height(self, row: Union[int, str], height: float,
                       table: Union[str, bool] = None) -> None: ...

    def set_col_width(self, col: Union[int, str], width: float,
                      table: Union[str, bool] = None) -> None: ...

    def _record(self) -> None: ...

    def _to_xlsx(self) -> None: ...

    def _to_csv(self) -> None: ...


def get_xlsx_keys(filler: Filler, as_dict: bool) -> List[Union[list, dict]]: ...


def get_csv_keys(filler: Filler, as_dict: bool) -> List[Union[list, dict]]: ...
class Recorder(BaseRecorder):
    """Buffer records in memory and write them to a csv, xlsx, json or txt file."""
    _SUPPORTS = ('csv', 'xlsx', 'json', 'txt')

    def __init__(self, path=None, cache_size=None):
        """Create a recorder that caches data and writes it out in batches.

        :param path: path of the file to write to
        :param cache_size: number of records that triggers an automatic write,
            0 disables automatic writing
        """
        super().__init__(path=path, cache_size=cache_size)
        self._delimiter = ','  # csv delimiter
        self._quote_char = '"'  # csv quote character
        self._follow_styles = False
        self._col_height = None
        self._style = None
        self._fit_head = False
        self._auto_new_col = False

    @property
    def set(self):
        """Return the setter object used to configure this recorder."""
        if self._setter is None:
            self._setter = RecorderSetter(self)
        return self._setter

    @property
    def delimiter(self):
        """Return the csv delimiter."""
        return self._delimiter

    @property
    def quote_char(self):
        """Return the csv quote character."""
        return self._quote_char

    def add_data(self, data, table=None):
        """Queue data for writing; several rows may be added at once.

        :param data: a single value, one row (list/tuple/dict) or a list of rows
        :param table: worksheet to write to (xlsx only); None uses the table set
            through ``set.table()``, a bool selects the active sheet
        :return: None
        """
        while self._pause_add:  # wait while another thread is writing
            sleep(.1)

        if not isinstance(data, (list, tuple, dict)):
            data = (data,)

        if not data:  # an empty container is recorded as one empty row
            data = ([],)
            self._data_count += 1

        # a single row
        elif isinstance(data, dict) or (isinstance(data, (list, tuple))
                                        and not isinstance(data[0], (list, tuple, dict))):
            data = [data_to_list_or_dict(self, data)]
            self._data_count += 1

        else:  # several rows
            if self.after or self.before:
                data = [data_to_list_or_dict(self, d) for d in data]
            else:
                data = [data_to_list_or_dict_simplify(d) for d in data]
            self._data_count += len(data)

        if self._type != 'xlsx':
            self._data.extend(data)

        else:
            if table is None:
                table = self._table
            elif isinstance(table, bool):
                table = None

            self._data.setdefault(table, []).extend(data)

        if 0 < self.cache_size <= self._data_count:
            self.record()

    def _record(self):
        """Write the cached data using the writer that matches the file type."""
        if self.type == 'csv':
            self._to_csv()
        elif self.type == 'xlsx':
            self._to_xlsx()
        elif self.type == 'json':
            self._to_json()
        elif self.type == 'txt':
            self._to_txt()

    def _to_xlsx(self):
        """Write the cached data to the xlsx file, one worksheet per table key."""
        wb, new_file = get_wb(self)
        tables = [i.title for i in wb.worksheets]

        def head_row_styles():
            # one style per cell of row 1 when a single CellStyle is configured
            return [self._style] * len(ws[1]) if isinstance(self._style, CellStyle) else self._style

        for table, data in self._data.items():
            _row_styles = None
            _col_height = None
            ws, new_sheet = get_ws(wb, table, tables, new_file)
            styled = bool(self._style or self._col_height)

            # --------- header and styles ---------
            begin, new_head = get_xlsx_head(self, new_file, new_sheet, data[0], ws)
            if new_head and styled:
                _set_style(self._col_height, head_row_styles(), ws, 1)
            if (new_file or new_sheet) and styled:
                begin = 1
                ws.append(ok_list(data[0], True))
                _set_style(self._col_height, head_row_styles(), ws, 1)

            max_row = None
            if self._follow_styles:
                # copy the styles and height of the current last row
                max_row = ws.max_row
                _row_styles = [CellStyleCopier(i) for i in ws[max_row]]
                _col_height = ws.row_dimensions[max_row].height
            elif styled:
                max_row = ws.max_row

            if new_file:  # save and reload once to work around an openpyxl bug
                if ws.cell(1, 1).value is None:
                    ws.cell(1, 1).value = ''
                wb.save(self.path)
                wb.close()
                wb = load_workbook(self.path)
                ws = wb[table] if table else wb.active
                new_file = False

            # ============== write the rows ==============
            fit = self._fit_head and self._head.get(ws.title)
            rewrite_head = False
            for row in data[begin:]:
                if fit:
                    row, rewrite_head = _fit_head_handle(row, self, ws, rewrite_head)
                ws.append(ok_list(row, True))

                if self._follow_styles:
                    max_row += 1
                    _set_style(_col_height, _row_styles, ws, max_row)
                elif styled:
                    max_row += 1
                    _set_style(self._col_height, _style_handle(self, row), ws, max_row)

            if rewrite_head:  # the header gained columns: rewrite row 1
                for c in ws[1]:
                    c.value = None
                for k, i in enumerate(self._head[ws.title], 1):
                    ws.cell(1, k).value = i

        wb.save(self.path)
        wb.close()

    def _to_csv(self):
        """Append the cached data to the csv file."""
        if self._head is not None and not self._file_exists:
            create_csv(self)
        elif self._head is None:
            get_csv_head(self)

        rewrite_head = False
        with open(self.path, 'a+', newline='', encoding=self.encoding) as f:
            from csv import writer
            csv_write = writer(f, delimiter=self.delimiter, quotechar=self.quote_char)
            if self._fit_head and self._head:
                for i in self._data:
                    if isinstance(i, dict):
                        if self._auto_new_col:
                            new_cols = [t for t in i if t not in self._head]
                            if new_cols:
                                # the header may be a tuple, so build a new list
                                self._head = list(self._head) + new_cols
                                rewrite_head = True
                        i = [i.get(h, None) for h in self._head]
                    csv_write.writerow(ok_list(i))

            else:
                for i in self._data:
                    csv_write.writerow(ok_list(i))

        # done after the file is closed because set_csv_head rewrites the file
        if rewrite_head:
            set_csv_head(self, self._head, True)

    def _to_txt(self):
        """Append the cached data to the txt file, one space-joined row per line."""
        with open(self.path, 'a+', encoding=self.encoding) as f:
            all_data = [' '.join(ok_list(i, as_str=True)) for i in self._data]
            f.write('\n'.join(all_data) + '\n')

    def _to_json(self):
        """Merge the cached data into the json file and rewrite it."""
        from json import load, dump
        if self._file_exists or Path(self.path).exists():
            with open(self.path, 'r', encoding=self.encoding) as f:
                json_data = load(f)

        else:
            json_data = []

        for i in self._data:
            if isinstance(i, dict):
                for d in i:
                    i[d] = process_content(i[d])
                json_data.append(i)
            else:
                json_data.append([process_content(d) for d in i])

        self._file_exists = True
        with open(self.path, 'w', encoding=self.encoding) as f:
            dump(json_data, f)
def _set_style(_col_height, _row_styles, ws, row):
    """Apply a row height and per-cell styles to one worksheet row.

    :param _col_height: row height to set, None leaves the height untouched
    :param _row_styles: iterable of style objects (falsy entries are skipped);
        the n-th style goes to the n-th column
    :param ws: worksheet providing ``row_dimensions`` and ``cell()``
    :param row: 1-based row number
    :return: None
    """
    if _col_height is not None:
        ws.row_dimensions[row].height = _col_height

    if _row_styles:
        for k, s in enumerate(_row_styles, start=1):
            if s:
                s.to_cell(ws.cell(row=row, column=k))


def _fit_head_handle(data, recorder, ws, rewrite_head):
    """Reorder a dict row to follow the header of worksheet ``ws``.

    Non-dict rows are returned unchanged. When ``recorder._auto_new_col`` is set,
    keys missing from the header are appended to it and the returned flag
    becomes True so the caller rewrites the header row.

    :param data: row to write
    :param recorder: recorder holding ``_head`` (keyed by sheet title) and ``_auto_new_col``
    :param ws: worksheet whose ``title`` selects the header
    :param rewrite_head: current value of the "header changed" flag
    :return: ``(row, rewrite_head)``
    """
    if isinstance(data, dict):
        head = recorder._head[ws.title]
        if recorder._auto_new_col:
            new_cols = [t for t in data if t not in head]
            if new_cols:
                # the header may be a tuple, so build a new list
                head = list(head) + new_cols
                recorder._head[ws.title] = head
                rewrite_head = True
        data = [data.get(h, None) for h in head]
    return data, rewrite_head


def _style_handle(recorder, data):
    """Return the styles for a row: one copy of a single CellStyle per value,
    otherwise ``recorder._style`` as configured."""
    return [recorder._style] * len(data) if isinstance(recorder._style, CellStyle) else recorder._style
diff --git a/src/flaresolverr/DataRecorder/setter.py b/src/flaresolverr/DataRecorder/setter.py new file mode 100644 index 0000000000..b0c7ac854a --- /dev/null +++ b/src/flaresolverr/DataRecorder/setter.py @@ -0,0 +1,441 @@ +# -*- coding:utf-8 -*- +from pathlib import Path + +from openpyxl.reader.excel import load_workbook +from openpyxl.utils import column_index_from_string +from openpyxl.workbook import Workbook + +from .tools import process_content, ok_list, make_valid_name + + +class OriginalSetter(object): + def __init__(self, recorder): + self._recorder = recorder + + def cache_size(self, size): + """设置缓存大小 + :param size: 缓存大小 + :return: None + """ + if not isinstance(size, int) or size < 0: + raise TypeError('cache_size值只能是int,且必须>=0') + self._recorder._cache = size + return self + + def path(self, path): + """设置文件路径 + :param path: 文件路径 + :return: None + """ + if self._recorder._path: + self._recorder.record() + + p = Path(path) + self._recorder._path = str(p.parent / make_valid_name(p.name)) + self._recorder._data = [] + return self + + def show_msg(self, on_off): + """设置是否显示运行信息 + :param on_off: bool表示开关 + :return: None + """ + self._recorder.show_msg = on_off + return self + + +class BaseSetter(OriginalSetter): + def table(self, name): + """设置默认表名 + :param name: 表名 + :return: None + """ + self._recorder._table = name + return self + + def before(self, before): + """设置在数据前面补充的列 + :param before: 列表、元组或字符串,为字符串时则补充一列 + :return: None + """ + if before is None: + self._recorder._before = None + elif isinstance(before, (list, dict)): + self._recorder._before = before + elif isinstance(before, tuple): + self._recorder._before = list(before) + else: + self._recorder._before = [before] + return self + + def after(self, after): + """设置在数据后面补充的列 + :param after: 列表、元组或字符串,为字符串时则补充一列 + :return: None + """ + if after is None: + self._recorder._after = None + elif isinstance(after, (list, dict)): + self._recorder._after = after + elif isinstance(after, tuple): + 
self._recorder._after = list(after) + else: + self._recorder._after = [after] + return self + + def encoding(self, encoding): + """设置编码 + :param encoding: 编码格式 + :return: None + """ + self._recorder._encoding = encoding + return self + + +class SheetLikeSetter(BaseSetter): + def head(self, head, table=None, to_file=True): + """设置表头。只有 csv 和 xlsx 格式支持设置表头 + :param head: 表头,列表或元组 + :param table: 表名,只xlsx格式文件有效 + :param to_file: 是否写入到文件 + :return: None + """ + self._recorder.record() + with self._recorder._lock: + if not self._recorder.path: + raise FileNotFoundError('未指定文件。') + if not isinstance(head, (list, tuple)): + raise TypeError('head参数只能是list或tuple格式。') + + if self._recorder.type == 'xlsx': + table = table or self._recorder.table + set_xlsx_head(self._recorder, head, table, to_file) + + elif self._recorder.type == 'csv': + set_csv_head(self._recorder, head, to_file) + + else: + raise TypeError('只能对xlsx和csv文件设置表头。') + return self + + def delimiter(self, delimiter): + """设置csv文件分隔符 + :param delimiter: 分隔符 + :return: None + """ + self._recorder._delimiter = delimiter + return self + + def quote_char(self, quote_char): + """设置csv文件引用符 + :param quote_char: 引用符 + :return: None + """ + self._recorder._quote_char = quote_char + return self + + def path(self, path, file_type=None): + """设置文件路径 + :param path: 文件路径 + :param file_type: 要设置的文件类型,为空则从文件名中获取 + :return: None + """ + super().path(path) + + if not file_type: + suffix = Path(path).suffix.lower() + if suffix: + file_type = suffix[1:] + elif not self._recorder.type: + file_type = 'csv' + + if file_type: + self.file_type(file_type) + + if self._recorder._type == 'xlsx': + self._recorder._data = {} + self._recorder._head = {} + self._recorder._style_data = {} + else: + self._recorder._data = [] + self._recorder._head = None + + return self + + def file_type(self, file_type): + """指定文件类型,无视文件后缀名""" + if 'any' not in self._recorder._SUPPORTS and file_type not in self._recorder._SUPPORTS: + raise 
TypeError(f'只支持{"、".join(self._recorder._SUPPORTS)}格式文件。') + self._recorder._type = file_type + return self + + def table(self, name): + """设置默认表名 + :param name: 表名 + :return: None + """ + if isinstance(name, bool): + name = None + self._recorder._table = name + return self + + def fit_head(self, on_off=True): + """设置是否自动匹配表头 + :param on_off: bool表示开关 + :return: None + """ + if self._recorder.type not in ('csv', 'xlsx'): + raise TypeError('只有csv或xlsx格式可设置fit_head。') + self._recorder.record() + self._recorder._fit_head = on_off + return self + + +class FillerSetter(SheetLikeSetter): + def sign(self, value): + """设置sign值 + :param value: 筛选条件 + :return: None + """ + self._recorder._sign = value + return self + + def deny_sign(self, on_off=True): + """设置是否反向匹配sign + :param on_off: bool表示开或关 + :return: None + """ + self._recorder._deny_sign = on_off + return self + + def key_cols(self, cols): + """设置作为关键字的列,可以是多列 + :param cols: 列号或列名,或它们组成的list或tuple + :return: None + """ + if cols is True: + self._recorder._key_cols = True + elif isinstance(cols, int) and cols > 0: + self._recorder._key_cols = [cols] + elif isinstance(cols, str): + self._recorder._key_cols = [int(cols)] if cols.isdigit() else [column_index_from_string(cols)] + elif isinstance(cols, (list, tuple)): + self._recorder._key_cols = [i if isinstance(i, int) and i > 0 else + int(i) if i.isdigit() else column_index_from_string(i) for i in cols] + else: + raise TypeError('col值只能是int或str,且必须大于0。') + return self + + def sign_col(self, col): + """设置用于判断是否已填数据的列 + :param col: 列号或列名 + :return: None + """ + if col is True or (isinstance(col, int) and col > 0): + self._recorder._sign_col = col + elif isinstance(col, str): + self._recorder._sign_col = int(col) if col.isdigit() else column_index_from_string(col) + else: + raise TypeError('col值只能是True、int或str,且必须大于0。') + return self + + def data_col(self, col): + """设置用于填充数据的列 + :param col: 列号或列名 + :return: None + """ + if isinstance(col, int) and col > 0: + 
self._recorder._data_col = col + elif isinstance(col, str): + self._recorder._data_col = column_index_from_string(col) + else: + raise TypeError('col值只能是int或str,且必须大于0。') + return self + + def begin_row(self, row): + """设置数据开始的行 + :param row: 行号 + :return: None + """ + if not isinstance(row, int) or row < 1: + raise TypeError('row值只能是int,且必须大于0') + self._recorder._begin_row = row + return self + + def path(self, path=None, key_cols=None, begin_row=None, sign_col=None, + data_col=None, sign=None, deny_sign=None): + """设置文件路径 + :param path: 保存的文件路径 + :param key_cols: 作为关键字的列,可以是多列 + :param begin_row: 数据开始的行,默认表头一行 + :param sign_col: 用于判断是否已填数据的列 + :param data_col: 要填入数据的第一列 + :param sign: 按这个值判断是否已填数据 + :param deny_sign: 是否反向匹配sign,即筛选指不是sign的行 + """ + if path: + super().path(path) + self._recorder.set.key_cols(key_cols or self._recorder.key_cols) + self._recorder.set.begin_row(begin_row or self._recorder.begin_row) + self._recorder.set.sign_col(sign_col or self._recorder.sign_col) + self._recorder.set.sign(sign or self._recorder.sign) + self._recorder.set.data_col(data_col or self._recorder.data_col) + self._recorder.set.deny_sign(deny_sign if deny_sign is not None else self._recorder.deny_sign) + return self + + def link_style(self, style): + """设置单元格的链接样式 + :param style: CellStyle对象 + :return: None + """ + self._recorder._link_style = style + return self + + +class RecorderSetter(SheetLikeSetter): + def follow_styles(self, on_off=True): + """设置是否跟随最后一行的style,只有xlsx格式有效 + :param on_off: True或False + :return: None + """ + self._recorder._follow_styles = on_off + return self + + def col_height(self, height): + """设置行高,只有xlsx格式有效 + :param height: 行高,传入None清空设置 + :return: None + """ + self._recorder._col_height = height + return self + + def styles(self, styles): + """设置新行样式,只有xlsx格式有效,可传入多个,传入None则取消 + :param styles: 传入CellStyle对象设置整个新行,传入CellStyle对象组成的列表设置多个,传入None清空设置 + :return: None + """ + self._recorder.record() + self._recorder._follow_styles = False + 
self._recorder._style = styles + return self + + def path(self, path, file_type=None): + """设置文件路径 + :param path: 文件路径 + :param file_type: 要设置的文件类型,为空则从文件名中获取 + :return: None + """ + super().path(path=path, file_type=file_type) + self._recorder._row_styles = None + return self + + def fit_head(self, on_off=True, add_new=False): + """设置是否自动匹配表头 + :param on_off: bool表示开关 + :param add_new: 数据中有表头不存在的列时是否自动添加到表头,on_off为True时才有效 + :return: None + """ + super().fit_head(on_off) + self._recorder._auto_new_col = add_new + return self + + +class DBSetter(BaseSetter): + def path(self, path, table=None): + """重写父类方法 + :param path: 文件路径 + :param table: 数据表名称 + :return: None + """ + with self._recorder._lock: + super().path(path) + if self._recorder._conn is not None: + self._recorder._close_connection() + self._recorder._connect() + + if table: + self._recorder._table = table + + else: + r = self._recorder.run_sql("select name from sqlite_master where type='table'") + self._recorder._table = r[0] if r else None + + self._recorder._data = {} + self._recorder._close_connection() + return self + + +def set_csv_head(recorder, head, to_file): + """设置csv文件的表头 + :param recorder: Recorder或Filler对象 + :param head: 表头列表或元组 + :param to_file: 是否写入文件 + :return: None + """ + recorder._head = head + if not to_file: + return + + from csv import writer + if recorder._file_exists or Path(recorder.path).exists(): + with open(recorder.path, 'r', newline='', encoding=recorder._encoding) as f: + content = "".join(f.readlines()[1:]) + + with open(recorder.path, 'w', newline='', encoding=recorder._encoding) as f: + csv_write = writer(f, delimiter=recorder._delimiter, quotechar=recorder._quote_char) + csv_write.writerow(ok_list(head)) + + with open(recorder.path, 'a+', newline='', encoding=recorder._encoding) as f: + f.write(f'{content}') + + else: + Path(recorder.path).parent.mkdir(parents=True, exist_ok=True) + with open(recorder.path, 'w', newline='', encoding=recorder._encoding) as f: + csv_write = 
writer(f, delimiter=recorder._delimiter, quotechar=recorder._quote_char) + csv_write.writerow(ok_list(head)) + + +def set_xlsx_head(recorder, head, table, to_file): + """设置xlsx文件的表头 + :param recorder: Recorder或Filler对象 + :param head: 表头列表或元组 + :param table: 工作表名称 + :param to_file: 是否写入文件 + :return: None + """ + if not to_file: + if table: + recorder._head[table] = head + elif recorder._file_exists or Path(recorder.path).exists(): + wb = load_workbook(recorder.path) + ws = wb.active + recorder._head[ws.title] = head + wb.close() + else: + recorder._head['Sheet'] = head + return + + if recorder._file_exists or Path(recorder.path).exists(): + wb = load_workbook(recorder.path) + if table: + ws = wb[table] if table in [i.title for i in wb.worksheets] else wb.create_sheet(title=table) + else: + ws = wb.active + + else: + Path(recorder.path).parent.mkdir(parents=True, exist_ok=True) + wb = Workbook() + ws = wb.active + if table: + ws.title = table + + if len(ws[1]) > len(head): + head = list(head) + head.extend([None] * (len(ws[1]) - len(head))) + + for key, i in enumerate(head, 1): + ws.cell(1, key).value = process_content(i, True) + + recorder._head[ws.title] = head + wb.save(recorder.path) + wb.close() diff --git a/src/flaresolverr/DataRecorder/setter.pyi b/src/flaresolverr/DataRecorder/setter.pyi new file mode 100644 index 0000000000..4be18edec5 --- /dev/null +++ b/src/flaresolverr/DataRecorder/setter.pyi @@ -0,0 +1,105 @@ +# -*- coding:utf-8 -*- +from pathlib import Path +from typing import Union, Any, Optional, List + +from .base import OriginalRecorder, BaseRecorder +from .style import CellStyle +from .db_recorder import DBRecorder +from .filler import Filler +from .recorder import Recorder + + +class OriginalSetter(object): + _recorder: OriginalRecorder = ... + + def __init__(self, recorder: OriginalRecorder): ... + + def cache_size(self, size: int) -> OriginalSetter: ... + + def path(self, path: Union[str, Path]) -> OriginalSetter: ... 
class OriginalSetter(object):
    # Type stub: every setter method returns the setter itself for chaining.
    _recorder: OriginalRecorder = ...

    def __init__(self, recorder: OriginalRecorder): ...

    def cache_size(self, size: int) -> OriginalSetter: ...

    def path(self, path: Union[str, Path]) -> OriginalSetter: ...

    def show_msg(self, on_off: bool) -> OriginalSetter: ...


class BaseSetter(OriginalSetter):
    _recorder: BaseRecorder = ...

    def table(self, name: Union[str, bool]) -> BaseSetter: ...

    def before(self, before: Any) -> BaseSetter: ...

    def after(self, after: Any) -> BaseSetter: ...

    def encoding(self, encoding: str) -> BaseSetter: ...


class SheetLikeSetter(BaseSetter):
    _recorder: Union[Filler, Recorder] = ...

    def head(self, head: Union[list, tuple], table: str = None, to_file: bool = True) -> SheetLikeSetter: ...

    def delimiter(self, delimiter: str) -> SheetLikeSetter: ...

    def quote_char(self, quote_char: str) -> SheetLikeSetter: ...

    def path(self, path: Union[str, Path], file_type: str = None) -> SheetLikeSetter: ...

    def file_type(self, file_type: str) -> SheetLikeSetter: ...

    def table(self, name: Union[str, bool]) -> SheetLikeSetter: ...

    def fit_head(self, on_off: bool = True) -> SheetLikeSetter: ...


class FillerSetter(SheetLikeSetter):
    _recorder: Filler = ...

    def __init__(self, recorder: Filler): ...

    def sign(self, value: Any) -> FillerSetter: ...

    def deny_sign(self, on_off: bool = True) -> FillerSetter: ...

    def key_cols(self, cols: Union[str, int, list, tuple, bool]) -> FillerSetter: ...

    def sign_col(self, col: Union[str, int, bool]) -> FillerSetter: ...

    def data_col(self, col: Union[str, int]) -> FillerSetter: ...

    def begin_row(self, row: int) -> FillerSetter: ...

    def path(self, path: Union[str, Path] = None, key_cols: Union[str, int, list, tuple, bool] = None,
             begin_row: int = None, sign_col: Union[str, int, bool] = None,
             data_col: Union[str, int] = None, sign: Any = None, deny_sign: bool = None) -> FillerSetter: ...

    def link_style(self, style: CellStyle) -> FillerSetter: ...


class RecorderSetter(SheetLikeSetter):
    _recorder: Recorder = ...

    def __init__(self, recorder: Recorder): ...

    def follow_styles(self, on_off: bool = True) -> RecorderSetter: ...

    # None clears the configured height; the setter is always returned
    def col_height(self, height: Optional[float]) -> RecorderSetter: ...

    # builtin ``tuple`` is used because ``typing.Tuple`` is not imported by this stub
    def styles(self, styles: Union[CellStyle, List[CellStyle], tuple[CellStyle, ...], None]) -> RecorderSetter: ...

    def path(self, path: Union[str, Path], file_type: str = None) -> RecorderSetter: ...

    def fit_head(self, on_off: bool = True, add_new: bool = False) -> RecorderSetter: ...


class DBSetter(BaseSetter):
    _recorder: DBRecorder = ...

    def __init__(self, recorder: DBRecorder): ...

    def path(self, path: Union[str, Path], table: Optional[str] = None) -> DBSetter: ...


def set_csv_head(recorder: Union[Recorder, Filler], head: Union[list, tuple], to_file: bool) -> None: ...


def set_xlsx_head(recorder: Union[Recorder, Filler], head: Union[list, tuple], table: str, to_file: bool) -> None: ...
+ + def follow_styles(self, on_off: bool = True) -> RecorderSetter: ... + + def col_height(self, height: float) -> Optional[RecorderSetter]: ... + + def styles(self, styles: Union[CellStyle, List[CellStyle], Tuple[CellStyle], None]) -> RecorderSetter: ... + + def path(self, path: Union[str, Path], file_type: str = None) -> RecorderSetter: ... + + def fit_head(self, on_off: bool = True, add_new: bool = False) -> RecorderSetter: ... + + +class DBSetter(BaseSetter): + _recorder: DBRecorder = ... + + def __init__(self, recorder: DBRecorder): ... + + def path(self, path: Union[str, Path], table: Optional[str] = None) -> DBSetter: ... + + +def set_csv_head(recorder: Union[Recorder, Filler], head: Union[list, tuple], to_file: bool) -> None: ... + + +def set_xlsx_head(recorder: Union[Recorder, Filler], head: Union[list, tuple], table: str, to_file: bool) -> None: ... diff --git a/src/flaresolverr/DataRecorder/style/__init__.py b/src/flaresolverr/DataRecorder/style/__init__.py new file mode 100644 index 0000000000..981247d17d --- /dev/null +++ b/src/flaresolverr/DataRecorder/style/__init__.py @@ -0,0 +1,2 @@ +from openpyxl.styles.colors import Color +from .cell_style import CellStyle diff --git a/src/flaresolverr/DataRecorder/style/cell_style.py b/src/flaresolverr/DataRecorder/style/cell_style.py new file mode 100644 index 0000000000..3d5448d597 --- /dev/null +++ b/src/flaresolverr/DataRecorder/style/cell_style.py @@ -0,0 +1,765 @@ +# -*- coding:utf-8 -*- +from copy import copy +from threading import Lock + +from openpyxl.styles import Alignment, Font, Side, Border, Protection, GradientFill, PatternFill, Color + + +class CellStyle(object): + font_args = ('name', 'size', 'charset', 'underline', 'color', 'scheme', 'vertAlign', + 'bold', 'italic', 'strike', 'outline', 'shadow', 'condense', 'extend') + border_args = ('start', 'end', 'left', 'right', 'top', 'bottom', 'diagonal', 'vertical', 'horizontal', + 'horizontal', 'outline', 'diagonalUp', 'diagonalDown') + alignment_args = 
('horizontal', 'vertical', 'indent', 'relativeIndent', 'justifyLastLine', 'readingOrder', + 'textRotation', 'wrapText', 'shrinkToFit') + protection_args = ('locked', 'hidden') + gradient_fill_args = ('type', 'degree', 'left', 'right', 'top', 'bottom', 'stop') + pattern_fill_args = ('patternType', 'fgColor', 'bgColor') + + def __init__(self): + """用于管理单元格样式的类""" + self._font = None + self._border = None + self._alignment = None + self._pattern_fill = None + self._gradient_fill = None + self._number_format = None + self._protection = None + + # 用于覆盖目标单元格的对象 + self._Font = None + self._Border = None + self._Alignment = None + self._Fill = None + self._Protection = None + + @property + def font(self): + """返回用于设置单元格字体的对象""" + if self._font is None: + self._font = CellFont() + return self._font + + @property + def border(self): + """返回用于设置单元格边框的对象""" + if self._border is None: + self._border = CellBorder() + return self._border + + @property + def alignment(self): + """返回用于设置单元格对齐选项的对象""" + if self._alignment is None: + self._alignment = CellAlignment() + return self._alignment + + @property + def pattern_fill(self): + """返回用于设置单元格图案填充的对象""" + self._gradient_fill = None + if self._pattern_fill is None: + self._pattern_fill = CellPatternFill() + return self._pattern_fill + + @property + def gradient_fill(self): + """返回用于设置单元格渐变填充的对象""" + self._pattern_fill = None + if self._gradient_fill is None: + self._gradient_fill = CellGradientFill() + return self._gradient_fill + + @property + def number_format(self): + """返回用于设置单元格数字格式的对象""" + if self._number_format is None: + self._number_format = CellNumberFormat() + return self._number_format + + @property + def protection(self): + """返回用于设置单元格保护选项的对象""" + if self._protection is None: + self._protection = CellProtection() + return self._protection + + def to_cell(self, cell, replace=True): + """把当前样式复制到目标单元格 + :param cell: 被设置样式的单元格对象 + :param replace: 是否直接替换目标单元格的样式,是的话效率较高,但不能保留未被设置的原有样式项 + :return: None + """ + if replace: + 
self._replace_to_cell(cell) + else: + self._cover_to_cell(cell) + + def _cover_to_cell(self, cell): + """把当前样式复制到目标单元格,只覆盖有设置的项,没有设置的原有的项不变 + :param cell: 被设置样式的单元格对象 + :return: None + """ + if self._font: + d = _handle_args(self.font_args, self._font, cell.font) + d['family'] = cell.font.family + cell.font = Font(**d) + + if self._border: + d = _handle_args(self.border_args, self._border, cell.border) + cell.border = Border(**d) + + if self._alignment: + d = _handle_args(self.alignment_args, self._alignment, cell.alignment) + cell.alignment = Alignment(**d) + + if self._pattern_fill: + f = None if 'fills.GradientFill' in str(cell.fill) else cell.fill + d = _handle_args(self.pattern_fill_args, self._pattern_fill, f) + cell.fill = PatternFill(**d) + + elif self._gradient_fill: + f = None if 'fills.PatternFill' in str(cell.fill) else cell.fill + d = _handle_args(self.gradient_fill_args, self._gradient_fill, f) + cell.fill = GradientFill(**d) + + if self._number_format and self._number_format.format != 'notSet': + cell.number_format = self._number_format.format + + if self._protection: + d = _handle_args(self.protection_args, self._protection, cell.protection) + cell.protection = Protection(**d) + + def _replace_to_cell(self, cell): + """把当前样式复制到目标单元格,覆盖原有的设置 + :param cell: 被设置样式的单元格对象 + :return: None + """ + if self._font: + if self._Font is None: + d = _handle_args(self.font_args, self._font, None) + self._Font = Font(**d) + cell.font = self._Font + + if self._border: + if self._Border is None: + d = _handle_args(self.border_args, self._border, None) + self._Border = Border(**d) + cell.border = self._Border + + if self._alignment: + if self._Alignment is None: + d = _handle_args(self.alignment_args, self._alignment, None) + self._Alignment = Alignment(**d) + cell.alignment = self._Alignment + + if self._pattern_fill: + if not isinstance(self._Fill, PatternFill): + d = _handle_args(self.pattern_fill_args, self._pattern_fill, None) + self._Fill = PatternFill(**d) + 
class CellFont(object):
    """Collects the font options to apply to a cell.

    Every option starts as the sentinel string 'notSet', which means the
    option has not been configured. Passing None to a setter restores the
    default for that option.
    """
    _LINE_STYLES = ('single', 'double', 'singleAccounting', 'doubleAccounting', None)
    _SCHEMES = ('major', 'minor', None)
    _VERT_ALIGNS = ('superscript', 'subscript', 'baseline', None)

    def __init__(self):
        self.name = 'notSet'
        self.charset = 'notSet'
        self.size = 'notSet'
        self.bold = 'notSet'
        self.italic = 'notSet'
        self.strike = 'notSet'
        self.outline = 'notSet'
        self.shadow = 'notSet'
        self.condense = 'notSet'
        self.extend = 'notSet'
        self.underline = 'notSet'
        self.vertAlign = 'notSet'
        self.color = 'notSet'
        self.scheme = 'notSet'

    def set_name(self, name):
        """Set the font name.

        :param name: font name; None restores the default
        :return: None
        """
        self.name = name

    def set_charset(self, charset):
        """Set the font charset.

        :param charset: charset as an int; None restores the default
        :return: None
        :raises TypeError: if charset is neither an int nor None
        """
        # None is accepted, as documented; previously it raised TypeError.
        if charset is not None and not isinstance(charset, int):
            raise TypeError('charset参数只能接收int类型。')
        self.charset = charset

    def set_size(self, size):
        """Set the font size.

        :param size: font size; None restores the default
        :return: None
        """
        self.size = size

    def set_bold(self, on_off):
        """Turn bold on or off.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.bold = on_off

    def set_italic(self, on_off):
        """Turn italic on or off.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.italic = on_off

    def set_strike(self, on_off):
        """Turn strike-through on or off.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.strike = on_off

    def set_outline(self, on_off):
        """Set the outline option.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.outline = on_off

    def set_shadow(self, on_off):
        """Turn the shadow on or off.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.shadow = on_off

    def set_condense(self, on_off):
        """Set the condense option.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.condense = on_off

    def set_extend(self, on_off):
        """Set the extend option.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.extend = on_off

    def set_color(self, color):
        """Set the font color.

        :param color: 'FFFFFF', '255,255,255', (255, 255, 255) or a Color object; None restores the default
        :return: None
        """
        self.color = get_color_code(color)

    def set_underline(self, option):
        """Set the underline style.

        :param option: 'single', 'double', 'singleAccounting' or 'doubleAccounting'; None restores the default
        :return: None
        :raises ValueError: if option is not one of the allowed values
        """
        if option not in self._LINE_STYLES:
            raise ValueError(f'option参数只能是{self._LINE_STYLES}其中之一。')
        self.underline = option

    def set_vertAlign(self, option):
        """Set superscript/subscript.

        :param option: 'superscript', 'subscript' or 'baseline'; None restores the default
        :return: None
        :raises ValueError: if option is not one of the allowed values
        """
        if option not in self._VERT_ALIGNS:
            raise ValueError(f'option参数只能是{self._VERT_ALIGNS}其中之一。')
        self.vertAlign = option

    def set_scheme(self, option):
        """Set the font scheme.

        :param option: 'major' or 'minor'; None restores the default
        :return: None
        :raises ValueError: if option is not one of the allowed values
        """
        if option not in self._SCHEMES:
            raise ValueError(f'option参数只能是{self._SCHEMES}其中之一。')
        self.scheme = option
def set_left(self, style, color):
    """Set the left border line.

    :param style: line style, one of the values in ``_LINE_STYLES``; None restores the default
    :param color: border color: 'FFFFFF', '255,255,255', (255, 255, 255) or a Color object
    :return: None
    :raises ValueError: if style is not an allowed line style
    """
    allowed = self._LINE_STYLES
    if style not in allowed:
        raise ValueError(f'style参数只能是{allowed}之一。')
    side = Side(style=style, color=get_color_code(color))
    self.left = side
def set_bottom(self, style, color):
    """Set the bottom border line.

    :param style: line style, one of the values in ``_LINE_STYLES``; None restores the default
    :param color: border color: 'FFFFFF', '255,255,255', (255, 255, 255) or a Color object
    :return: None
    :raises ValueError: if style is not an allowed line style
    """
    allowed = self._LINE_STYLES
    if style not in allowed:
        raise ValueError(f'style参数只能是{allowed}之一。')
    side = Side(style=style, color=get_color_code(color))
    self.bottom = side
class CellAlignment(object):
    """Collects the alignment options to apply to a cell.

    Every option starts as the sentinel string 'notSet', which means the
    option has not been configured.
    """
    _horizontal_alignments = ('general', 'left', 'center', 'right', 'fill', 'justify', 'centerContinuous',
                              'distributed', None)
    _vertical_alignments = ('top', 'center', 'bottom', 'justify', 'distributed', None)

    def __init__(self):
        self.horizontal = 'notSet'
        self.vertical = 'notSet'
        self.indent = 'notSet'
        self.relativeIndent = 'notSet'
        self.justifyLastLine = 'notSet'
        self.readingOrder = 'notSet'
        self.textRotation = 'notSet'
        self.wrapText = 'notSet'
        self.shrinkToFit = 'notSet'

    def set_horizontal(self, horizontal):
        """Set the horizontal alignment.

        :param horizontal: 'general', 'left', 'center', 'right', 'fill', 'justify', 'centerContinuous'
                           or 'distributed'; None restores the default
        :return: None
        :raises ValueError: if horizontal is not an allowed value
        """
        if horizontal not in self._horizontal_alignments:
            raise ValueError(f'horizontal参数必须是{self._horizontal_alignments}其中之一。')
        self.horizontal = horizontal

    def set_vertical(self, vertical):
        """Set the vertical alignment.

        :param vertical: 'top', 'center', 'bottom', 'justify' or 'distributed'; None restores the default
        :return: None
        :raises ValueError: if vertical is not an allowed value
        """
        if vertical not in self._vertical_alignments:
            # The message names this method's own parameter; it previously
            # said "horizontal".
            raise ValueError(f'vertical参数必须是{self._vertical_alignments}其中之一。')
        self.vertical = vertical

    def set_indent(self, indent):
        """Set the indent.

        :param indent: int from 0 to 255
        :return: None
        :raises ValueError: if indent is not an int in range
        """
        if not (isinstance(indent, int) and 0 <= indent <= 255):
            raise ValueError('indent参数必须在0到255之间。')
        self.indent = indent

    def set_relativeIndent(self, indent):
        """Set the relative indent.

        :param indent: int from -255 to 255
        :return: None
        :raises ValueError: if indent is not an int in range
        """
        if not (isinstance(indent, int) and -255 <= indent <= 255):
            raise ValueError('indent参数必须在-255到255之间。')
        self.relativeIndent = indent

    def set_justifyLastLine(self, on_off):
        """Set the justifyLastLine option.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.justifyLastLine = on_off

    def set_readingOrder(self, value):
        """Set the reading order.

        :param value: int not smaller than 0
        :return: None
        :raises ValueError: if value is not a non-negative int
        """
        if not (isinstance(value, int) and value >= 0):
            raise ValueError('value参数必须不小于0。')
        self.readingOrder = value

    def set_textRotation(self, value):
        """Set the text rotation angle.

        :param value: 0 to 180, or the special value 255
        :return: None
        :raises ValueError: if value is outside the allowed values
        """
        if not (0 <= value <= 180 or value == 255):
            # The message also mentions 255, which the check accepts.
            raise ValueError('value必须在0到180之间,或等于255。')
        self.textRotation = value

    def set_wrapText(self, on_off):
        """Set the wrapText option.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.wrapText = on_off

    def set_shrinkToFit(self, on_off):
        """Set the shrinkToFit option.

        :param on_off: bool switch; None restores the default
        :return: None
        """
        self.shrinkToFit = on_off
class CellPatternFill(object):
    """Collects the pattern-fill options to apply to a cell.

    All three options start as the sentinel string 'notSet'.
    """
    _FILES = ('none', 'solid', 'darkDown', 'darkGray', 'darkGrid', 'darkHorizontal', 'darkTrellis', 'darkUp',
              'darkVertical', 'gray0625', 'gray125', 'lightDown', 'lightGray', 'lightGrid', 'lightHorizontal',
              'lightTrellis', 'lightUp', 'lightVertical', 'mediumGray', None)

    def __init__(self):
        self.patternType = self.fgColor = self.bgColor = 'notSet'

    def set_patternType(self, name):
        """Set the pattern type.

        :param name: one of the pattern names listed in ``_FILES``; None restores the default
        :return: None
        :raises ValueError: if name is not an allowed pattern
        """
        allowed = self._FILES
        if name in allowed:
            self.patternType = name
        else:
            raise ValueError(f'name参数只能是{allowed}其中之一。')

    def set_fgColor(self, color):
        """Set the foreground color.

        :param color: 'FFFFFF', '255,255,255', (255, 255, 255) or a Color object; None restores the default
        :return: None
        """
        code = get_color_code(color)
        self.fgColor = code

    def set_bgColor(self, color):
        """Set the background color.

        :param color: 'FFFFFF', '255,255,255', (255, 255, 255) or a Color object; None restores the default
        :return: None
        """
        code = get_color_code(color)
        self.bgColor = code
class CellStyleCopier(object):
    """Snapshots the style of one cell so it can be applied to other cells."""

    def __init__(self, from_cell):
        """Copy every style component of the source cell.

        :param from_cell: cell object whose style is copied
        """
        self._style = copy(from_cell._style)
        self._font = copy(from_cell.font)
        self._border = copy(from_cell.border)
        self._fill = copy(from_cell.fill)
        self._number_format = copy(from_cell.number_format)
        self._protection = copy(from_cell.protection)
        self._alignment = copy(from_cell.alignment)

    def to_cell(self, cell):
        """Apply the captured style to a target cell.

        :param cell: cell object that receives the style
        :return: None
        """
        # Each target gets its own copy of the style container. Sharing one
        # mutable object between cells would let a later style change on one
        # cell leak into every other cell styled by this copier.
        cell._style = copy(self._style)
        cell.alignment = self._alignment
        cell.font = self._font
        cell.border = self._border
        cell.fill = self._fill
        cell.number_format = self._number_format
        cell.protection = self._protection
def align_csv(path, encoding='utf-8', delimiter=',', quotechar='"'):
    """Pad every row of a csv file so that all rows have the same number of columns.

    The file is rewritten in place. Short rows are padded with empty cells up
    to the length of the longest row, which avoids errors when the file is
    read by tools that expect a rectangular table, such as pandas.

    :param path: path of the file to process
    :param encoding: file encoding
    :param delimiter: field delimiter
    :param quotechar: quote character
    :return: None
    """
    # newline='' leaves line-ending handling to the csv module, so newlines
    # embedded in quoted fields are read back intact.
    with open(path, 'r', encoding=encoding, newline='') as f:
        lines = list(csv_reader(f, delimiter=delimiter, quotechar=quotechar))

    max_len = max((len(line) for line in lines), default=0)
    for line in lines:
        # None is written out by the csv writer as an empty cell.
        line.extend([None] * (max_len - len(line)))

    # The context manager closes and flushes the output file; the handle was
    # previously never closed.
    with open(path, 'w', encoding=encoding, newline='') as f:
        csv_writer(f, delimiter=delimiter, quotechar=quotechar).writerows(lines)
:param data_col: 列号,用于只传入行号的情况 + :return: 坐标tuple:(行, 列),或(None, 列) + """ + return_coord = None + if coord == 'newline': # 新增一行,列为data_col + return_coord = None, data_col + + elif isinstance(coord, (int, float)) and coord != 0: + return_coord = int(coord), data_col + + elif isinstance(coord, str): + coord = coord.replace(' ', '') + + if coord.isalpha(): # 只输入列号,要新建一行 + return_coord = None, column_index_from_string(coord) + + elif ',' in coord: # '3,1'形式 + x, y = coord.split(',') + if x.lower() in ('', 'new', 'none', 'newline'): + x = None + elif x.isdigit(): + x = int(x) + else: + raise ValueError('行格式不正确。') + + if y.isdigit(): + y = int(y) + elif y.isalpha(): + y = column_index_from_string(y) + else: + raise TypeError('列格式不正确。') + + return_coord = x, y + + else: # 'A3'或'3A'形式 + m = match(r'^[$]?([A-Za-z]{1,3})[$]?(-?\d+)$', coord) + if m: + y, x = m.groups() + return_coord = int(x), column_index_from_string(y) + + else: + m = match(r'^[$]?(-?\d+)[$]?([A-Za-z]{1,3})$', coord) + if not m: + raise ValueError(f'{coord} 坐标格式不正确。') + x, y = m.groups() + return_coord = int(x), column_index_from_string(y) + + elif isinstance(coord, (tuple, list)): + if len(coord) != 2: + raise ValueError('coord为list或tuple时长度必须为2。') + + x = None + if coord[0] not in (None, 'new', 'newline'): + x = int(coord[0]) + + if isinstance(coord[1], int): + y = coord[1] + elif isinstance(coord[1], str): + y = column_index_from_string(coord[1]) + else: + raise TypeError('列格式不正确。') + + return_coord = x, y + + if not return_coord or return_coord[0] == 0 or return_coord[1] == 0: + raise ValueError(f'{return_coord} 坐标格式不正确。') + return return_coord + + +def process_content(content, excel=False): + """处理单个单元格要写入的数据 + :param content: 未处理的数据内容 + :param excel: 是否为excel文件 + :return: 处理后的数据 + """ + if isinstance(content, (int, str, float, type(None))): + data = content + elif isinstance(content, (Cell, ReadOnlyCell)): + data = content.value + else: + data = str(content) + + if excel and isinstance(data, str): + 
def data_to_list_or_dict(recorder, data):
    """Normalize data to a list/tuple or dict and attach the recorder's before/after columns.

    :param recorder: object exposing ``before`` and ``after`` attributes
    :param data: data to process; None becomes an empty tuple, a scalar becomes a 1-tuple
    :return: the data as a list, tuple or dict, with before/after values added
    """
    if data is None:
        data = tuple()

    elif not isinstance(data, (list, tuple, dict)):
        data = (data,)

    if not (recorder.before or recorder.after):
        return data

    if isinstance(data, (list, tuple)):
        return_list = []
        for i in (recorder.before, data, recorder.after):
            if isinstance(i, dict):
                return_list.extend(list(i.values()))
            elif i is None:
                pass
            elif isinstance(i, list):
                return_list.extend(i)
            elif isinstance(i, tuple):
                return_list.extend(list(i))
            else:
                # A scalar before/after value is stored as its string form.
                return_list.extend([str(i)])

        return return_list

    elif isinstance(data, dict):
        if not recorder.before:
            pass
        elif isinstance(recorder.before, dict):
            data = {**recorder.before, **data}
        elif isinstance(recorder.before, (list, tuple)):
            # Mixing a list with a dict produces a plain list of values.
            data1 = list(recorder.before)
            data1.extend(data.values())
            data = data1

        if not recorder.after:
            return data

        elif isinstance(data, dict):
            if isinstance(recorder.after, dict):
                data = {**data, **recorder.after}
            elif isinstance(recorder.after, (list, tuple)):
                # Use the dict's values, matching the before-branch above;
                # list(data) would have produced the keys.
                data = list(data.values())
                data.extend(recorder.after)

        elif isinstance(data, list):
            if isinstance(recorder.after, dict):
                data.extend(recorder.after.values())
            elif isinstance(recorder.after, (list, tuple)):
                data.extend(recorder.after)

        return data
def create_csv(recorder):
    """Create the recorder's csv file if it is missing and mark it as existing.

    :param recorder: object exposing ``path`` and ``encoding``; its ``_file_exists`` flag is set to True
    :return: None
    """
    target = recorder.path
    already_there = Path(target).exists()
    if not already_there:
        # Opening in 'w' mode and writing nothing leaves an empty file.
        with open(target, 'w', newline='', encoding=recorder.encoding):
            pass
    recorder._file_exists = True
def remove_list_end_Nones(in_list):
    """Return a list copy of the input without its trailing None items.

    None items in the middle of the sequence are kept. ``False`` or ``None``
    (used by callers as a "no header" marker) is returned unchanged instead of
    raising TypeError.

    :param in_list: list or tuple to process, or False/None
    :return: new list without trailing Nones, or the input itself when it is False/None
    """
    if in_list is None or in_list is False:
        return in_list

    end = len(in_list)
    while end and in_list[end - 1] is None:
        end -= 1
    return list(in_list[:end])
+ + +def ok_list(data_list: Union[list, dict], excel: bool = False, as_str: bool = False) -> list: ... + + +def get_usable_coord_int(coord: Union[tuple, list], + max_row: int, + max_col: Union[int, Worksheet]) -> Tuple[int, int]: ... + + +def get_usable_coord(coord: Union[tuple, list], + max_row: int, + ws: Worksheet) -> Tuple[int, int]: ... + + +def data_to_list_or_dict_simplify(data: Union[list, tuple, dict, None]) -> Union[list, dict]: ... + + +def data_to_list_or_dict(recorder: BaseRecorder, data: Union[list, tuple, dict, None]) -> Union[list, dict]: ... + + +def get_csv_head(recorder: Union[Recorder, Filler], is_filler: bool = False) -> Optional[list]: ... + + +def get_xlsx_head(recorder: Union[Recorder, Filler], new_file: bool, new_sheet: bool, + first_data: Union[dict, list, tuple], ws: Worksheet, is_filler: bool = False) -> Tuple[int, bool]: ... + + +def create_csv(recorder: Union[Recorder, Filler]) -> None: ... + + +def get_wb(recorder: Union[Recorder, Filler]) -> tuple: ... + + +def get_ws(wb, table, tables, new_file) -> Tuple[Worksheet, bool]: ... + + +class FillerDict(dict): + row: int = ... + + +class FillerList(list): + row: int = ... 
class BlockSizeSetter(object):
    """Descriptor that validates a block size and stores it, in bytes, on ``_block_size``.

    Accepts a positive int (bytes) or a string made of a positive integer and
    a one-letter unit, e.g. ``'512k'``, ``'20M'``, ``'1g'``.
    """
    # Multipliers in bytes for each unit suffix (binary units).
    _UNITS = {'b': 1, 'k': 1024, 'm': 1024 ** 2, 'g': 1024 ** 3}

    def __set__(self, block_size, val):
        """Validate val and store the resulting byte count on the owner.

        :param block_size: owner instance
        :param val: positive int, or str such as '50m'
        :raises ValueError: if a str has an unknown unit or a non-positive/invalid number
        :raises TypeError: if val is neither a positive int nor a str
        """
        if isinstance(val, int) and val > 0:
            size = val
        elif isinstance(val, str):
            unit = self._UNITS.get(val[-1:].lower())
            try:
                num = int(val[:-1])
            except ValueError:
                num = 0  # reported below with the module's own message
            if not unit or num <= 0:
                raise ValueError('单位只支持B、K、M、G,数字必须为大于0的整数。')
            size = num * unit
        else:
            raise TypeError('block_size只能传入int或str,数字必须为大于0的整数。')

        block_size._block_size = size

    def __get__(self, block_size, objtype=None) -> int:
        """Return the stored block size in bytes."""
        return block_size._block_size
存在重名文件时的处理方式 + :param encoding: 编码格式 + :param lock: 线程锁 + :return: 文件名、文件大小、保存路径、是否跳过 + """ + # ------------获取文件大小------------ + file_size = response.headers.get('Content-Length', None) + file_size = None if file_size is None else int(file_size) + + # ------------获取网络文件名------------ + file_name = _get_file_name(response, encoding) + + # ------------获取保存路径------------ + goal_Path = Path(goal_path) + # 按windows规则去除路径中的非法字符 + g = goal_path[len(goal_Path.anchor):] if goal_path.lower().startswith(goal_Path.anchor.lower()) else goal_path + goal_path = goal_Path.anchor + sub(r'[*:|<>?"]', '', g).strip() + goal_Path = Path(goal_path).absolute() + goal_Path.mkdir(parents=True, exist_ok=True) + + # ------------获取保存文件名------------ + # -------------------重命名------------------- + if rename: + if suffix is not None: + full_name = f'{rename}.{suffix}' if suffix else rename + + else: + tmp = file_name.rsplit('.', 1) + ext_name = f'.{tmp[-1]}' if len(tmp) > 1 else '' + tmp = rename.rsplit('.', 1) + ext_rename = f'.{tmp[-1]}' if len(tmp) > 1 else '' + full_name = rename if ext_rename == ext_name else f'{rename}{ext_name}' + + elif suffix is not None: + full_name = file_name.rsplit(".", 1)[0] + if suffix: + full_name = f'{full_name}.{suffix}' + + else: + full_name = file_name + + full_name = make_valid_name(full_name) + + # -------------------生成路径------------------- + skip = False + create = True + full_path = goal_Path / full_name + + with lock: + if full_path.exists(): + if file_exists == 'rename': + full_path = get_usable_path(full_path) + + elif file_exists == 'skip': + skip = True + create = False + + elif file_exists == 'overwrite': + full_path.unlink() + + elif file_exists == 'add': + create = False + + if create: + with open(full_path, 'wb'): + pass + + return {'size': file_size, + 'path': full_path, + 'skip': skip} + + +def _get_file_name(response, encoding) -> str: + """从headers或url中获取文件名,如果获取不到,生成一个随机文件名 + :param response: 返回的response + :param encoding: 在headers获取时指定编码格式 + 
:return: 下载文件的文件名 + """ + file_name = '' + charset = '' + content_disposition = response.headers.get('content-disposition', '').replace(' ', '') + + # 使用header里的文件名 + if content_disposition: + txt = search(r'filename\*="?([^";]+)', content_disposition) + if txt: # 文件名自带编码方式 + txt = txt.group(1).split("''", 1) + if len(txt) == 2: + charset, file_name = txt + else: + file_name = txt[0] + + else: # 文件名没带编码方式 + txt = search(r'filename="?([^";]+)', content_disposition) + if txt: + file_name = txt.group(1) + + # 获取编码(如有) + charset = encoding or response.encoding + + file_name = file_name.strip("'") + + # 在url里获取文件名 + if not file_name and os_PATH.basename(response.url): + file_name = os_PATH.basename(response.url).split("?")[0] + + # 找不到则用时间和随机数生成文件名 + if not file_name: + file_name = f'untitled_{time()}_{randint(0, 100)}' + + # 去除非法字符 + charset = charset or 'utf-8' + return unquote(file_name, charset) + + +def set_session_cookies(session, cookies): + """设置Session对象的cookies + :param session: Session对象 + :param cookies: cookies信息 + :return: None + """ + # cookies = cookies_to_tuple(cookies) + for cookie in cookies: + if cookie['value'] is None: + cookie['value'] = '' + + kwargs = {x: cookie[x] for x in cookie + if x.lower() in ('version', 'port', 'domain', 'path', 'secure', + 'expires', 'discard', 'comment', 'comment_url', 'rest')} + + if 'expiry' in cookie: + kwargs['expires'] = cookie['expiry'] + + session.cookies.set(cookie['name'], cookie['value'], **kwargs) diff --git a/src/flaresolverr/DownloadKit/_funcs.pyi b/src/flaresolverr/DownloadKit/_funcs.pyi new file mode 100644 index 0000000000..2a2d1b7539 --- /dev/null +++ b/src/flaresolverr/DownloadKit/_funcs.pyi @@ -0,0 +1,46 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +""" +from pathlib import Path +from threading import Lock +from typing import Union, Optional + +from requests import Session, Response + +FILE_EXISTS_MODE: dict = ... + + +def copy_session(session: Session) -> Session: ... 
+ + +class BlockSizeSetter(object): + def __set__(self, block_size, val: Union[str, int]): ... + + def __get__(self, block_size, objtype=None) -> int: ... + + +class PathSetter(object): + def __set__(self, goal_path, val: Union[str, Path]): ... + + def __get__(self, goal_path, objtype=None): ... + + +class FileExistsSetter(object): + def __set__(self, file_exists, mode: str): ... + + def __get__(self, file_exists, objtype=None): ... + + +def get_file_exists_mode(mode: str) -> str: ... + + +def set_charset(response: Response, encoding: Optional[str]) -> Response: ... + + +def get_file_info(response: Response, goal_path: str = None, rename: str = None, suffix: str = None, + file_exists: str = None, encoding: Optional[str] = None, lock: Lock = None) -> dict: ... + + +def set_session_cookies(session: Session, cookies: list) -> None: ... diff --git a/src/flaresolverr/DownloadKit/downloadKit.py b/src/flaresolverr/DownloadKit/downloadKit.py new file mode 100644 index 0000000000..055c52640d --- /dev/null +++ b/src/flaresolverr/DownloadKit/downloadKit.py @@ -0,0 +1,564 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@File : downloadKit.py +""" +from copy import copy +from pathlib import Path +from queue import Queue +from re import sub +from threading import Thread, Lock +from time import sleep, perf_counter + +from requests import Response +from requests.structures import CaseInsensitiveDict + +from ._funcs import FileExistsSetter, PathSetter, BlockSizeSetter, set_charset, get_file_info, get_file_exists_mode +from .mission import Task, Mission +from .setter import Setter + + +class DownloadKit(object): + file_exists = FileExistsSetter() + goal_path = PathSetter() + block_size = BlockSizeSetter() + + def __init__(self, goal_path=None, roads=10, session=None, file_exists='rename', driver=None): + """ + :param goal_path: 文件保存路径 + :param roads: 可同时运行的线程数 + :param file_exists: 有同名文件名时的处理方式,可选 'skip', 'overwrite', 'rename', 'add' + :param driver: 
使用的Session对象,或配置对象、页面对象等 + """ + self._roads = roads + self._missions = {} + self._threads = {i: None for i in range(self._roads)} + self._waiting_list = Queue() + self._missions_num = 0 + self._running_count = 0 # 正在运行的任务数 + self._stop_printing = False # 用于控制显示线程停止 + self._lock = Lock() + self.page = None + self._retry = None + self._interval = None + self._timeout = None + self._encoding = None + + self._setter = None + self._print_mode = None + self._log_mode = None + self._logger = None + + self.goal_path = goal_path or '.' + self.file_exists = file_exists + self.split = True + self.block_size = '50M' + self.set.driver(session or driver) + + def __call__(self, file_url, goal_path=None, rename=None, suffix=None, file_exists=None, show_msg=True, **kwargs): + """以阻塞的方式下载一个文件并返回结果 + :param file_url: 文件网址 + :param goal_path: 保存路径 + :param rename: 重命名的文件名 + :param suffix: 指定后缀名 + :param file_exists: 遇到同名文件时的处理方式,可选 'skip', 'overwrite', 'rename', 'add',默认跟随实例属性 + :param show_msg: 是否打印进度 + :param kwargs: 连接参数 + :return: 任务结果和信息组成的tuple + """ + return self.download(file_url=file_url, goal_path=goal_path, rename=rename, suffix=suffix, + file_exists=file_exists, show_msg=show_msg, **kwargs) + + @property + def set(self): + """用于设置打印和记录模式的对象""" + if self._setter is None: + self._setter = Setter(self) + return self._setter + + @property + def roads(self): + """可同时运行的线程数""" + return self._roads + + @property + def retry(self): + """返回连接失败时重试次数""" + if self._retry is not None: + return self._retry + elif self.page is not None: + return self.page.retry_times + else: + return 3 + + @property + def interval(self): + """返回连接失败时重试间隔""" + if self._interval is not None: + return self._interval + elif self.page is not None: + return self.page.retry_interval + else: + return 5 + + @property + def timeout(self): + """返回连接超时时间""" + if self._timeout is not None: + return self._timeout + elif self.page is not None: + return self.page.timeout + else: + return 20 + + @property + def 
waiting_list(self): + """返回等待队列""" + return self._waiting_list + + @property + def session(self): + """返回用于保存默认连接设置的Session对象""" + return self._session + + @property + def is_running(self): + """返回是否有线程还在运行""" + return self._running_count > 0 + + @property + def missions(self): + """用list方式返回所有任务对象""" + return self._missions + + @property + def encoding(self): + """返回指定的编码格式""" + return self._encoding + + def add(self, file_url, goal_path=None, rename=None, suffix=None, file_exists=None, split=None, **kwargs): + """添加一个下载任务并将其返回 + :param file_url: 文件网址 + :param goal_path: 保存路径 + :param rename: 重命名的文件名 + :param suffix: 重命名的文件后缀名 + :param file_exists: 遇到同名文件时的处理方式,可选 'skip', 'overwrite', 'rename', 'add',默认跟随实例属性 + :param split: 是否允许多线程分块下载,为None则使用对象属性 + :param kwargs: 连接参数 + :return: 任务对象 + """ + self._missions_num += 1 + self._running_count += 1 + if file_exists is not None: + file_exists = get_file_exists_mode(file_exists) + mission = Mission(self._missions_num, self, file_url, str(goal_path or self.goal_path), rename, suffix, + file_exists or self.file_exists, self.split if split is None else split, self._encoding, + kwargs) + self._missions[self._missions_num] = mission + self._run_or_wait(mission) + return mission + + def download(self, file_url, goal_path=None, rename=None, suffix=None, file_exists=None, show_msg=True, **kwargs): + """以阻塞的方式下载一个文件并返回结果 + :param file_url: 文件网址 + :param goal_path: 保存路径 + :param rename: 重命名的文件名 + :param suffix: 重命名的文件后缀名 + :param file_exists: 遇到同名文件时的处理方式,可选 'skip', 'overwrite', 'rename', 'add',默认跟随实例属性 + :param show_msg: 是否打印进度 + :param kwargs: 连接参数 + :return: 任务结果和信息组成的tuple + """ + if show_msg: + tmp = self._print_mode + self._print_mode = None + r = self.add(file_url=file_url, goal_path=goal_path, rename=rename, suffix=suffix, file_exists=file_exists, + split=False, **kwargs).wait(show=show_msg) + if show_msg: + self._print_mode = tmp + return r + + def _run_or_wait(self, mission): + """接收任务,有空线程则运行,没有则进入等待队列 + :param 
mission: 任务对象 + :return: None + """ + thread_id = self._get_usable_thread() + if thread_id is not None: + thread = Thread(target=self._run, args=(thread_id, mission), daemon=False) + self._threads[thread_id] = {'thread': thread, 'mission': None} + thread.start() + else: + self._waiting_list.put(mission) + + def _run(self, ID, mission): + """线程函数 + :param ID: 线程id + :param mission: 任务对象,Mission或Task + :return: None + """ + while True: + if not mission: # 如果没有任务,就从等候列表中取一个 + if not self._waiting_list.empty(): + try: + mission = self._waiting_list.get(True, .5) + except Exception: + self._waiting_list.task_done() + break + else: + break + + self._threads[ID]['mission'] = mission + self._download(mission, ID) + mission = None + + self._threads[ID] = None + + def get_mission(self, mission_or_id): + """根据id值获取一个任务 + :param mission_or_id: 任务或任务id + :return: 任务对象 + """ + return self._missions[mission_or_id] if isinstance(mission_or_id, int) else mission_or_id + + def get_failed_missions(self): + """返回失败任务列表""" + return [i for i in self._missions.values() if i.result is False] + + def wait(self, mission=None, show=False, timeout=None): + """等待所有或指定任务完成 + :param mission: 任务对象或任务id,为None时等待所有任务结束 + :param show: 是否显示进度 + :param timeout: 超时时间,None或0为无限 + :return: 任务结果和信息组成的tuple + """ + timeout = 0 if timeout is None else timeout + if mission: + return self.get_mission(mission).wait(show, timeout) + + else: + if show: + self.show(False) + else: + end_time = perf_counter() + timeout + while self.is_running and (perf_counter() < end_time or timeout == 0): + sleep(0.1) + + def cancel(self): + """取消所有等待中或执行中的任务""" + for m in self._missions.values(): + m.cancel() + + def show(self, asyn=True, keep=False): + """实时显示所有线程进度 + :param asyn: 是否以异步方式显示 + :param keep: 任务列表为空时是否保持显示 + :return: None + """ + if asyn: + Thread(target=self._show, args=(2, keep)).start() + else: + self._show(0.1, keep) + + def _show(self, wait, keep=False): + """实时显示所有线程进度 + :param wait: 超时时间(秒) + :param keep: 
任务列表为空时是否保持显示 + :return: None + """ + self._stop_printing = False + + if keep: + Thread(target=self._stop_show).start() + + end_time = perf_counter() + wait + while not self._stop_printing and (keep or self.is_running or perf_counter() < end_time): + print(f'\033[K', end='') + print(f'等待任务数:{self._waiting_list.qsize()}') + for k, v in self._threads.items(): + m = v['mission'] if v else None + if m: + items = (m.mission.rate, m.mid) if isinstance(m, Task) else (m.rate, m.id) + path = f'M{items[1]} {items[0]}% {m}' + else: + path = '空闲' + print(f'\033[K', end='') + print(f'线程{k}:{path}') + + print(f'\033[{self.roads + 1}A\r', end='') + sleep(0.4) + + print(f'\033[1B', end='') + for i in range(self.roads): + print(f'\033[K', end='') + print(f'线程{i}:空闲') + + print() + + def _connect(self, url, session, _headers, method, encoding, **kwargs): + """生成response对象 + :param url: 目标url + :param session: 用于连接的Session对象 + :param _headers: 内置的headers参数 + :param method: 请求方式 + :param encoding: 编码格式 + :param kwargs: 连接参数 + :return: tuple,第一位为Response或None,第二位为出错信息或'Success' + """ + if 'headers' in kwargs: # 不知道为什么添加这个才能正常使用多线程 + kwargs['headers'] = CaseInsensitiveDict({**_headers, **kwargs['headers']}) + else: + kwargs['headers'] = _headers + + r = err = None + for i in range(self.retry + 1): + try: + if method == 'get': + r = session.get(url, **kwargs) + elif method == 'post': + r = session.post(url, **kwargs) + + if r: + return set_charset(r, encoding), 'Success' + + except Exception as e: + err = e + + if r and r.status_code in (403, 404): + break + if i < self.retry: + sleep(self.interval) + + # 返回失败结果 + if r is None: + return None, '连接失败' if err is None else err + if not r.ok: + return r, f'状态码:{r.status_code}' + + def _get_usable_thread(self): + """获取可用线程,没有则返回None""" + for k, v in self._threads.items(): + if v is None: + return k + + def _stop_show(self): + """设置停止打印的变量""" + input() + self._stop_printing = True + + def _when_mission_done(self, mission): + """当任务完成时执行的操作 + 
:param mission: 完结的任务 + :return: None + """ + self._running_count -= 1 + if self._print_mode == 'all' or (self._print_mode == 'failed' and mission.result is False): + print(f'[{mission.RESULT_TEXTS[mission.result]}] {mission.data.url} {mission.info}') + + if self._log_mode == 'all' or (self._log_mode == 'failed' and mission.result is False): + data = ('下载结果', + mission.data.url, + mission.data.goal_path, + mission.data.rename, + mission.data.kwargs) + self._logger.add_data(data) + + mission.session.close() + + def _download(self, mission_or_task, thread_id): + """此方法是执行下载的线程方法,用于根据任务下载文件 + :param mission_or_task: 下载任务对象 + :param thread_id: 线程号 + :return: None + """ + if mission_or_task.is_done: + return + if mission_or_task.state == 'cancel': + mission_or_task.state = 'done' + return + + file_url = mission_or_task.data.url + + if isinstance(mission_or_task, Task): + kwargs = copy(mission_or_task.data.kwargs) + task = mission_or_task + kwargs['headers']['Range'] = f"bytes={task.range[0]}-{task.range[1]}" + r, inf = self._connect(file_url, task.mission.session, task.mission.headers, task.mission.method, + task.mission.encoding, **kwargs) + + if r: + _do_download(r, task, False) + else: + task._set_done(False, inf) + + return + + # ===================开始处理mission==================== + mission = mission_or_task + mission.info = '下载中' + mission.state = 'running' + kwargs = mission_or_task.data.kwargs + if self._print_mode == 'all': + print(f'开始下载:{mission.data.url}') + if self._log_mode == 'all': + self._logger.add_data(('开始下载', mission.data.url)) + + rename = mission.data.rename + suffix = mission.data.suffix + goal_path = mission.data.goal_path + file_exists = mission.data.file_exists + split = mission.data.split + + goal_Path = Path(goal_path) + # 按windows规则去除路径中的非法字符 + g = goal_path[len(goal_Path.anchor):] if goal_path.lower().startswith(goal_Path.anchor.lower()) else goal_path + goal_path = goal_Path.anchor + sub(r'[*:|<>?"]', '', g).strip() + goal_Path = 
Path(goal_path).absolute() + goal_Path.mkdir(parents=True, exist_ok=True) + goal_path = str(goal_Path) + + if file_exists == 'skip' and rename: + tmp = goal_Path / rename + if tmp.exists() and tmp.is_file(): + mission.file_name = rename + mission._set_path(goal_Path / rename) + mission._set_done('skipped', str(mission.path)) + return + + r, inf = self._connect(file_url, mission.session, mission.headers, mission.method, mission.encoding, **kwargs) + + if mission.is_done: + return + + if not r: + mission._break_mission(result=False, info=inf) + return + + # -------------------获取文件信息------------------- + file_info = get_file_info(r, goal_path, rename, suffix, file_exists, mission.encoding, self._lock) + file_size = file_info['size'] + full_path = file_info['path'] + mission._set_path(full_path) + mission.file_name = full_path.name + mission.size = file_size + + if file_info['skip']: + mission._set_done('skipped', str(mission.path)) + return + + full_Path = Path(full_path) + if file_exists == 'add' and full_Path.exists(): + mission.data.offset = full_Path.stat().st_size + + # -------------------设置分块任务------------------- + first = False + if split and file_size and file_size > self.block_size and r.headers.get('Accept-Ranges') == 'bytes': + first = True + chunks = [[s, min(s + self.block_size, file_size) - 1] for s in range(0, file_size, self.block_size)] + chunks[-1][-1] = '' + chunks_len = len(chunks) + + task1 = Task(mission, chunks[0], f'1/{chunks_len}', chunks[0][1] - chunks[0][0]) + mission.tasks_count = chunks_len + mission.tasks = [task1] + + for ind, chunk in enumerate(chunks[1:], 2): + s = file_size - chunk[0] if chunks_len == ind else chunk[1] - chunk[0] + task = Task(mission, chunk, f'{ind}/{chunks_len}', s) + mission.tasks.append(task) + self._run_or_wait(task) + + else: # 不分块 + task1 = Task(mission, None, '1/1', file_size) + mission.tasks.append(task1) + + self._threads[thread_id]['mission'] = task1 + _do_download(r, task1, first) + + +def _do_download(r: 
Response, task: Task, first: bool = False): + """执行下载任务 + :param r: Response对象 + :param task: 任务 + :param first: 是否第一个分块 + :return: None + """ + if task.is_done or task.mission.is_done: + return + + task.set_states(result=None, info='下载中', state='running') + block_size = 131072 # 128k + result = None + + try: + if first: # 分块是第一块 + if task.range[1] <= block_size or task.range[1] % block_size != 0: + r_content = r.iter_content(chunk_size=task.range[1] + 1) + task.add_data(next(r_content), seek=0 + task.mission.data.offset) + if task.state in ('cancel', 'done'): + result = 'canceled' + task.clear_cache() + + else: + blocks = task.range[1] // block_size + remainder = task.range[1] % block_size + r_content = r.iter_content(chunk_size=block_size) + for b in range(blocks): + task.add_data(next(r_content), seek=b * block_size + task.mission.data.offset) + if task.state in ('cancel', 'done'): + result = 'canceled' + task.clear_cache() + break + + if task.state in ('cancel', 'done'): + result = 'canceled' + task.clear_cache() + else: + task.add_data(next(r_content)[:remainder + 1], blocks * block_size + task.mission.data.offset) + + else: + if task.range is None: # 不分块 + for chunk in r.iter_content(chunk_size=block_size): + if task.state in ('cancel', 'done'): + result = 'canceled' + task.clear_cache() + break + if chunk: + task.add_data(chunk, None) + + elif task.range[1] == '': # 结尾的数据块 + begin = task.range[0] + for chunk in r.iter_content(chunk_size=block_size): + if task.state in ('cancel', 'done'): + result = 'canceled' + task.clear_cache() + break + if chunk: + task.add_data(chunk, seek=begin + task.mission.data.offset) + begin += len(chunk) + + else: # 有始末数字的数据块 + begin, end = task.range + num = (end - begin) // block_size + for ind, chunk in enumerate(r.iter_content(chunk_size=block_size), 1): + if task.state in ('cancel', 'done'): + result = 'canceled' + task.clear_cache() + break + if chunk: + task.add_data(chunk, seek=begin + task.mission.data.offset) + if ind <= 
num: + begin += block_size + + except Exception as e: + result, info = False, f'下载失败。{r.status_code} {e}' + + else: + result = 'success' if result is None else result + info = str(task.path) + + finally: + r.close() + + task._set_done(result=result, info=info) diff --git a/src/flaresolverr/DownloadKit/downloadKit.pyi b/src/flaresolverr/DownloadKit/downloadKit.pyi new file mode 100644 index 0000000000..f93316fe97 --- /dev/null +++ b/src/flaresolverr/DownloadKit/downloadKit.pyi @@ -0,0 +1,180 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +""" +from pathlib import Path +from queue import Queue +from threading import Lock +from typing import Union, Tuple, Any, Literal, Optional + +from ..DataRecorder import Recorder +from ..DrissionPage._base.base import BasePage +from requests import Session, Response +from requests.structures import CaseInsensitiveDict + +from ._funcs import FileExistsSetter, PathSetter, BlockSizeSetter +from .mission import Task, Mission, BaseTask +from .setter import Setter + +FILE_EXISTS = Literal['add', 'skip', 'rename', 'overwrite', 'a', 's', 'r', 'o'] + + +class DownloadKit(object): + file_exists: FileExistsSetter = ... + goal_path: PathSetter = ... + block_size: BlockSizeSetter = ... + + _roads: int = ... + _setter: Optional[Setter] = ... + _print_mode: Optional[str] = ... + _log_mode: Optional[str] = ... + _logger: Optional[Recorder] = ... + _retry: Optional[int] = ... + _interval: Optional[float] = ... + page: Optional[BasePage] = ... + _waiting_list: Queue = ... + _session: Session = ... + _running_count: int = ... + _missions_num: int = ... + _missions: dict = ... + _threads: dict = ... + _timeout: Optional[int, float] = ... + _stop_printing: bool = ... + _lock: Lock = ... + split: bool = ... + _encoding: Optional[str] = ... + + def __init__(self, + goal_path: Union[str, Path] = None, + roads: int = 10, + driver: Union[Session, BasePage] = None, + file_exists: FILE_EXISTS = 'rename'): ... 
+ + def __call__(self, + file_url: str, + goal_path: Optional[str, Path] = None, + rename: str = None, + suffix: Optional[str] = None, + file_exists: FILE_EXISTS = None, + show_msg: bool = True, + timeout: Optional[float] = None, + params: Optional[dict] = ..., + data: Any = ..., + json: Any = ..., + headers: Optional[dict] = ..., + cookies: Any = ..., + files: Any = ..., + auth: Any = ..., + allow_redirects: bool = ..., + proxies: Optional[dict] = ..., + hooks: Any = ..., + stream: Any = ..., + verify: Any = ..., + cert: Any = ...) -> tuple: ... + + @property + def set(self) -> Setter: ... + + @property + def roads(self) -> int: ... + + @property + def retry(self) -> int: ... + + @property + def interval(self) -> float: ... + + @property + def timeout(self) -> float: ... + + @property + def waiting_list(self) -> Queue: ... + + @property + def session(self) -> Session: ... + + @property + def is_running(self) -> bool: ... + + @property + def missions(self) -> dict: ... + + @property + def encoding(self) -> Optional[str]: ... + + def add(self, + file_url: str, + goal_path: Optional[str, Path] = None, + rename: str = None, + suffix: str = None, + file_exists: FILE_EXISTS = None, + split: bool = None, + timeout: Optional[float] = None, + params: Optional[dict] = ..., + data: Any = None, + json: Optional[dict, str] = ..., + headers: Optional[dict] = ..., + cookies: Any = ..., + files: Any = ..., + auth: Any = ..., + allow_redirects: bool = ..., + proxies: Optional[dict] = ..., + hooks: Any = ..., + stream: Any = ..., + verify: Any = ..., + cert: Any = ...) -> Mission: ... 
+ + def download(self, + file_url: str, + goal_path: Optional[str, Path] = None, + rename: str = None, + suffix: str = None, + file_exists: FILE_EXISTS = None, + show_msg: bool = True, + timeout: Optional[float] = None, + params: Optional[dict] = ..., + data: Any = ..., + json: Any = ..., + headers: Optional[dict] = ..., + cookies: Any = ..., + files: Any = ..., + auth: Any = ..., + allow_redirects: bool = ..., + proxies: Optional[dict] = ..., + hooks: Any = ..., + stream: Any = ..., + verify: Any = ..., + cert: Any = ...) -> tuple: ... + + def _run_or_wait(self, mission: BaseTask) -> None: ... + + def _run(self, ID: int, mission: BaseTask) -> None: ... + + def get_mission(self, mission_or_id: Union[int, Mission]) -> Mission: ... + + def get_failed_missions(self) -> list: ... + + def wait(self, + mission: Union[int, Mission] = None, + show: bool = False, + timeout: float = None) -> Optional[tuple]: ... + + def cancel(self) -> None: ... + + def show(self, asyn: bool = True, keep: bool = False) -> None: ... + + def _show(self, wait: float, keep: bool = False) -> None: ... + + def _connect(self, url: str, session: Session, _headers: CaseInsensitiveDict, + method: str, encoding: Optional[str], **kwargs) -> Tuple[Union[Response, None], str]: ... + + def _get_usable_thread(self) -> Optional[int]: ... + + def _stop_show(self) -> None: ... + + def _when_mission_done(self, mission: Mission) -> None: ... + + def _download(self, + mission_or_task: Union[Mission, Task], + thread_id: int) -> None: ... 
diff --git a/src/flaresolverr/DownloadKit/mission.py b/src/flaresolverr/DownloadKit/mission.py new file mode 100644 index 0000000000..8bd5580371 --- /dev/null +++ b/src/flaresolverr/DownloadKit/mission.py @@ -0,0 +1,360 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +""" +from pathlib import Path +from time import sleep, perf_counter +from urllib.parse import quote, urlparse + +from ..DataRecorder import ByteRecorder +from requests.structures import CaseInsensitiveDict + +from ._funcs import copy_session, set_session_cookies + + +class MissionData(object): + def __init__(self, url, goal_path, rename, suffix, file_exists, split, kwargs, offset=0): + """保存任务数据的对象 + :param url: 下载文件url + :param goal_path: 保存文件夹 + :param rename: 文件重命名 + :param suffix: 文件重命名后缀名 + :param file_exists: 存在重名文件时处理方式 + :param split: 是否允许分块下载 + :param kwargs: requests其它参数 + :param offset: 文件存储偏移量 + """ + self.url = quote(url, safe='-_.~!*\'"();:@&=+$,/\\?#[]%') + self.goal_path = goal_path + self.rename = rename + self.suffix = suffix + self.file_exists = file_exists + self.split = split + self.kwargs = kwargs + self.offset = offset + + +class BaseTask(object): + _DONE = 'done' + RESULT_TEXTS = {'success': '成功', 'skipped': '跳过', 'canceled': '取消', False: '失败', None: '未知'} + + def __init__(self, ID): + """任务类基类 + :param ID: 任务id + """ + self._id = ID + self.state = 'waiting' # 'waiting'、'running'、'done' + self.result = None # 'success'、'skipped'、'canceled'、False、None + self.info = '等待下载' # 信息 + + @property + def id(self): + """返回任务或子任务id""" + return self._id + + @property + def data(self): + """返回任务数据""" + return + + @property + def is_done(self): + """返回任务是否结束""" + return self.state in ('done', 'cancel') + + def set_states(self, result=None, info=None, state='done'): + """设置任务结果值 + :param result: 结果:'success'、'skipped'、'canceled'、False、None + :param info: 任务信息 + :param state: 任务状态:'waiting'、'running'、'done' + :return: None + """ + self.result = result + self.info = 
info + self.state = state + + +class Mission(BaseTask): + def __init__(self, ID, download_kit, file_url, goal_path, rename, suffix, file_exists, split, encoding, kwargs): + """任务类 + :param ID: 任务id + :param download_kit: 所属DownloadKit对象 + :param file_url: 文件网址 + :param goal_path: 保存文件夹路径 + :param rename: 重命名 + :param suffix: 重命名后缀名 + :param file_exists: 存在同名文件处理方式 + :param split: 是否分块下载 + :param encoding: 编码格式 + :param kwargs: 连接参数 + """ + super().__init__(ID) + self.download_kit = download_kit + self.size = None + + self.tasks = [] + self.tasks_count = 1 + self.done_tasks_count = 0 + + self.file_name = None + self._path = None # 文件完整路径,Path对象 + self._recorder = None + self.encoding = encoding + + self._set_session() + kwargs = self._handle_kwargs(file_url, kwargs) + self._data = MissionData(file_url, goal_path, rename, suffix, file_exists, split, kwargs) + self.method = 'post' if (self._data.kwargs.get('data', None) is not None or + self._data.kwargs.get('json', None) is not None) else 'get' + + def __repr__(self): + return f'' + + @property + def data(self): + """返回任务数据""" + return self._data + + @property + def path(self): + """返回文件保存路径""" + return self._path + + @property + def recorder(self): + """返回记录器对象""" + if self._recorder is None: + self._recorder = ByteRecorder(cache_size=100) + self._recorder.show_msg = False + return self._recorder + + @property + def rate(self): + """返回下载进度百分比""" + if not self.size: + return None + c = 0 + for t in self.tasks: + c += t._downloaded_size if t._downloaded_size else 0 + return round((c / self.size) * 100, 2) + + def cancel(self) -> None: + """取消该任务,停止未下载完的task""" + self._break_mission('canceled', '已取消') + + def del_file(self): + """删除下载的文件""" + if self.path and self.path.exists(): + try: + self.path.unlink() + except Exception: + pass + + def wait(self, show=True, timeout=0): + """等待当前任务完成 + :param show: 是否显示下载进度 + :param timeout: 超时时间 + :return: 任务结果和信息组成的tuple + """ + if show: + print(f'url:{self.data.url}') + t2 = 
perf_counter() + while self.file_name is None and perf_counter() - t2 < 4: + sleep(0.01) + print(f'文件名:{self.file_name}') + print(f'目标路径:{self.path}') + if not self.size: + print('未知大小 ', end='') + + t1 = perf_counter() + while not self.is_done and (perf_counter() - t1 < timeout or timeout == 0): + if show and self.size: + try: + rate = round((self.path.stat().st_size / self.size) * 100, 2) + print(f'\r{rate}% ', end='') + except FileNotFoundError: + pass + + sleep(0.1) + + if show: + if self.result is False: + print(f'下载失败 {self.info}') + elif self.result == 'success': + print('\r100% ', end='') + print(f'下载完成 {self.info}') + elif self.result == 'skipped': + print(f'已跳过 {self.info}') + print() + + return self.result, self.info + + def _set_session(self): + """复制Session对象,并设置cookies""" + session = copy_session(self.download_kit.session) + headers = session.headers + session.headers = None + if self.download_kit.page: + set_session_cookies(session, self.download_kit.page.cookies()) + if hasattr(self.download_kit.page, '_headers'): + headers = CaseInsensitiveDict({**self.download_kit.page._headers, **headers}) + headers.update({"User-Agent": self.download_kit.page.user_agent}) + self.session = session + self.headers = headers + + def _handle_kwargs(self, url, kwargs): + """处理接收到的参数 + :param url: 要访问的url + :param kwargs: 传入的参数dict + :return: 处理后的参数dict + """ + if 'timeout' not in kwargs: + kwargs['timeout'] = self.download_kit.timeout + + headers = CaseInsensitiveDict(kwargs['headers']) if 'headers' in kwargs else CaseInsensitiveDict() + + parsed_url = urlparse(url) + hostname = parsed_url.hostname + scheme = parsed_url.scheme + + if not ('Referer' in headers or 'Referer' in self.headers): + headers['Referer'] = self.download_kit.page.url if self.download_kit.page and self.download_kit.page.url \ + else f'{scheme}://{hostname}' + if not ('Host' in headers or 'Host' in self.headers): + headers['Host'] = hostname + kwargs['headers'] = headers + + return kwargs + + def 
def _a_task_done(self, is_success, info):
    """Record the completion of one sub-task of this mission.

    Nothing happens when the mission is already done.  A sub-task reporting
    exactly ``False`` aborts the whole mission; any other value advances the
    finished-task counter and, once it equals ``tasks_count``, marks the
    mission as successful.

    :param is_success: result reported by the sub-task; ``False`` means failure
    :param info: message passed along by the sub-task
    :return: None
    """
    if not self.is_done:
        if is_success is False:
            self._break_mission(False, info)
        else:
            self.done_tasks_count += 1
            all_finished = self.done_tasks_count == self.tasks_count
            if all_finished:
                self._set_done('success', info)
@property
def rate(self):
    """Download progress of this sub-task as a percentage.

    :return: percentage rounded to two decimals, or None when the
             sub-task size is unknown or zero
    """
    total = self.size
    if not total:
        return None
    fraction = self._downloaded_size / total
    return round(fraction * 100, 2)
class Mission(BaseTask):
    """Type stub for a download mission (one file to fetch)."""
    file_name: Optional[str] = ...
    _data: MissionData = ...
    # ``Optional`` accepts exactly one type argument; ``Optional[str, Path]``
    # is rejected by type checkers, so the union is spelled out.
    _path: Union[str, Path, None] = ...
    _recorder: Optional[ByteRecorder] = ...
    size: Optional[float] = ...
    done_tasks_count: int = ...
    tasks_count: int = ...
    tasks: List[Task] = ...
    download_kit: DownloadKit = ...
    session: Session = ...
    headers: CaseInsensitiveDict = ...
    method: str = ...
    encoding: Optional[str] = ...

    def __init__(self, ID: int, download_kit: DownloadKit, file_url: str, goal_path: Union[str, Path], rename: str,
                 suffix: str, file_exists: str, split: bool, encoding: Optional[str], kwargs: dict): ...

    def __repr__(self) -> str: ...

    # The implementation stores the session and headers on the instance and
    # returns nothing.
    def _set_session(self) -> None: ...

    def _handle_kwargs(self, url: str, kwargs: dict) -> dict: ...

    @property
    def data(self) -> MissionData: ...

    @property
    def path(self) -> Union[str, Path]: ...

    def _set_path(self, path: Union[str, Path, None]) -> None: ...

    @property
    def recorder(self) -> ByteRecorder: ...

    @property
    def rate(self) -> Optional[float]: ...

    # ``result`` is one of 'success', 'skipped', 'canceled', False or None.
    def _set_done(self, result: Union[bool, str, None], info: str) -> None: ...

    def _a_task_done(self, is_success: bool, info: str) -> None: ...

    def _break_mission(self, result: Union[bool, str, None], info: str) -> None: ...

    def cancel(self) -> None: ...

    def del_file(self) -> None: ...

    def wait(self, show: bool = True, timeout: float = 0) -> tuple: ...
+ _downloaded_size: int = 0 + + def __init__(self, mission: Mission, range_: Optional[list], ID: str, size: Optional[int]): ... + + def __repr__(self) -> str: ... + + @property + def mid(self) -> int: ... + + @property + def data(self) -> MissionData: ... + + @property + def path(self) -> str: ... + + @property + def file_name(self) -> str: ... + + @property + def rate(self) -> Optional[float]: ... + + def add_data(self, data: bytes, seek: int = None) -> None: ... + + def clear_cache(self) -> None: ... + + def _set_done(self, result: Optional[bool, str], info: str) -> None: ... diff --git a/src/flaresolverr/DownloadKit/setter.py b/src/flaresolverr/DownloadKit/setter.py new file mode 100644 index 0000000000..5ffea9103b --- /dev/null +++ b/src/flaresolverr/DownloadKit/setter.py @@ -0,0 +1,206 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@File : setter.py +""" +from ..DataRecorder import Recorder +from requests import Session + +from ._funcs import get_file_exists_mode + + +class Setter(object): + def __init__(self, downloadKit): + """ + :param downloadKit: downloadKit对象 + """ + self._downloadKit = downloadKit + + @property + def if_file_exists(self): + """返回用于设置文件同名策略的对象""" + return FileExists(self) + + @property + def log(self): + """返回用于设置记录模式的对象""" + return LogSet(self) + + def driver(self, driver): + """设置Session对象 + :param driver: Session对象或DrissionPage的页面对象 + :return: None + """ + if driver is None: + self._downloadKit._session = Session() + return + + elif isinstance(driver, Session): + self._downloadKit._session = driver + return + + _type = str(type(driver)) + if _type.startswith(' FileExists: ... + + @property + def log(self) -> LogSet: ... + + def driver(self, driver: Union[Session, BasePage, SessionOptions]) -> None: ... + + def roads(self, num: int) -> None: ... + + def retry(self, times: int) -> None: ... + + def interval(self, seconds: float) -> None: ... + + def timeout(self, seconds: float) -> None: ... 
+ + def goal_path(self, path: Union[str, Path]) -> None: ... + + def split(self, on_off: bool) -> None: ... + + def block_size(self, size: Union[str, int]) -> None: ... + + def proxies(self, http: str = None, https: str = None) -> None: ... + + def encoding(self, encoding: Optional[str]) -> None: ... + + +class LogSet(object): + _setter: Setter = ... + + def __init__(self, setter: Setter): ... + + def path(self, path: Union[str, Path]) -> None: ... + + def print_all(self) -> None: ... + + def print_failed(self) -> None: ... + + def print_nothing(self) -> None: ... + + def log_all(self) -> None: ... + + def log_failed(self) -> None: ... + + def log_nothing(self) -> None: ... + + +class FileExists(object): + _setter: Setter = ... + + def __init__(self, setter: Setter): ... + + def __call__(self, mode: FILE_EXISTS): ... + + def skip(self) -> None: ... + + def rename(self) -> None: ... + + def overwrite(self) -> None: ... diff --git a/src/flaresolverr/DrissionPage/__init__.py b/src/flaresolverr/DrissionPage/__init__.py new file mode 100644 index 0000000000..39c0eb0f78 --- /dev/null +++ b/src/flaresolverr/DrissionPage/__init__.py @@ -0,0 +1,17 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. 
+""" +from ._base.browser import Chromium +from ._configs.chromium_options import ChromiumOptions +from ._configs.session_options import SessionOptions +from ._pages.session_page import SessionPage + +from ._pages.chromium_page import ChromiumPage +from ._pages.mix_page import MixPage +from ._pages.mix_page import MixPage as WebPage + +__version__ = '4.1.0.0b19' diff --git a/src/flaresolverr/DrissionPage/__init__.pyi b/src/flaresolverr/DrissionPage/__init__.pyi new file mode 100644 index 0000000000..79a75ce6e1 --- /dev/null +++ b/src/flaresolverr/DrissionPage/__init__.pyi @@ -0,0 +1,18 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. +""" +from ._base.browser import Chromium +from ._configs.chromium_options import ChromiumOptions +from ._configs.session_options import SessionOptions +from ._pages.session_page import SessionPage + +from ._pages.chromium_page import ChromiumPage +from ._pages.mix_page import MixPage +from ._pages.mix_page import MixPage as WebPage + +__all__ = ['MixPage', 'WebPage', 'ChromiumPage', 'Chromium', 'ChromiumOptions', 'SessionOptions', 'SessionPage', '__version__'] +__version__: str = ... diff --git a/src/flaresolverr/DrissionPage/_base/base.py b/src/flaresolverr/DrissionPage/_base/base.py new file mode 100644 index 0000000000..9ff74df75c --- /dev/null +++ b/src/flaresolverr/DrissionPage/_base/base.py @@ -0,0 +1,445 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. 
class BaseParser(object):
    """Base class of all page and element classes.

    Provides the public lookup entry points (``ele``, ``eles`` and calling
    the object directly); the actual search is delegated to ``_ele``, which
    subclasses implement.
    """

    def __call__(self, locator, index=1):
        """Shortcut for ``ele()``.

        ``index`` is accepted (default 1) to match the signature declared in
        the accompanying type stub; previously passing it raised TypeError.

        :param locator: locator of the element to find
        :param index: which match to return, starting at 1
        :return: whatever ``ele()`` returns
        """
        return self.ele(locator, index=index)

    def ele(self, locator, index=1, timeout=None):
        """Find a single element.

        :param locator: locator of the element to find
        :param index: which match to return, starting at 1
        :param timeout: search timeout in seconds
        :return: result of ``_ele`` for a single match
        """
        return self._ele(locator, timeout, index=index, method='ele()')

    def eles(self, locator, timeout=None):
        """Find all matching elements (``index=None`` is passed to ``_ele``).

        :param locator: locator of the elements to find
        :param timeout: search timeout in seconds
        :return: result of ``_ele`` for all matches
        """
        return self._ele(locator, timeout, index=None)

    # ---------------- attributes / methods below are implemented by subclasses ----------------
    @property
    def html(self):
        """Placeholder: the base implementation returns an empty string."""
        return ''

    def s_ele(self, locator=None):
        """Placeholder overridden by subclasses."""
        pass

    def s_eles(self, locator):
        """Placeholder overridden by subclasses."""
        pass

    def _ele(self, locator, timeout=None, index=1, raise_err=None, method=None):
        """Placeholder overridden by subclasses; performs the actual lookup."""
        pass

    def _find_elements(self, locator, timeout=None, index=1, relative=False, raise_err=None):
        """Placeholder overridden by subclasses."""
        pass
def texts(self, text_node_only=False):
    """Return the texts of the element's direct child nodes.

    :param text_node_only: when True only text nodes are queried; otherwise
                           child elements contribute their ``text`` as well
    :return: list of formatted texts, with empty and whitespace-only entries dropped
    """
    if text_node_only:
        raw_items = self.eles('xpath:/text()')
    else:
        raw_items = []
        for node in self.eles('xpath:./text() | *'):
            raw_items.append(node if isinstance(node, str) else node.text)

    cleaned = []
    for item in raw_items:
        # Skip falsy entries and entries consisting only of whitespace.
        if not item or sub('[\r\n\t ]', '', item) == '':
            continue
        cleaned.append(format_html(item.strip(' ').rstrip('\n')))
    return cleaned
def child(self, locator='', index=1, timeout=None, ele_only=True):
    """Return one direct child element or node, optionally filtered.

    :param locator: filter locator; an int here is treated as ``index``
    :param index: which match to return, starting at 1
    :param timeout: search timeout in seconds
    :param ele_only: True to consider elements only, False to include other nodes (``node()``)
    :return: the matching child, or a NoneElement when nothing matched
    """
    if isinstance(locator, int):
        locator, index = '', locator

    if locator:
        parsed = get_loc(locator, True)  # convert the locator to xpath
        if parsed[0] == 'css selector':
            raise ValueError('此css selector语法不受支持,请换成xpath。')
        step = parsed[1].lstrip('./')
    elif ele_only:
        step = '*'
    else:
        step = 'node()'

    found = self._ele(f'xpath:./{step}', timeout=timeout, index=index, relative=True, raise_err=False)
    if found:
        return found
    return NoneElement(self.owner, 'child()', {'locator': locator, 'index': index, 'ele_only': ele_only})
def children(self, locator='', timeout=None, ele_only=True):
    """Return all direct child elements or nodes, optionally filtered.

    :param locator: filter locator; empty means no filter
    :param timeout: search timeout in seconds
    :param ele_only: True to consider elements only, False to include other nodes (``node()``)
    :return: list of children, with whitespace-only strings removed
    """
    if locator:
        parsed = get_loc(locator, True)  # convert the locator to xpath
        if parsed[0] == 'css selector':
            raise ValueError('此css selector语法不受支持,请换成xpath。')
        step = parsed[1].lstrip('./')
    elif ele_only:
        step = '*'
    else:
        step = 'node()'

    found = self._ele(f'xpath:./{step}', timeout=timeout, index=None, relative=True)

    kept = []
    for node in found:
        # Drop text nodes that contain nothing but whitespace.
        if isinstance(node, str) and sub('[ \n\t\r]', '', node) == '':
            continue
        kept.append(node)
    return kept
def _get_relative(self, func, direction, brother, locator='', index=1, timeout=None, ele_only=True):
    """Fetch one related element or node via ``_get_relatives``.

    :param func: name of the public method on whose behalf the search runs
    :param direction: 'following' or 'preceding'
    :param brother: forwarded to ``_get_relatives`` (True limits the search to siblings)
    :param locator: filter locator; an int here is treated as ``index``
    :param index: which match to return, starting at 1
    :param timeout: search timeout in seconds
    :param ele_only: True to consider elements only, False to include other nodes
    :return: the match, or a NoneElement when nothing was found
    """
    if isinstance(locator, int):
        locator, index = '', locator

    found = self._get_relatives(index, locator, direction, brother, timeout, ele_only)
    if found:
        return found
    return NoneElement(self.owner, func, {'locator': locator, 'index': index, 'ele_only': ele_only})
def _before_connect(self, url, retry, interval):
    """Prepare the target URL and retry settings before connecting.

    A ``Path`` object, or a string without ``://`` (and without ``:\\\\``),
    is checked against the file system; when it exists the absolute path is
    used as the URL and the target is flagged as a local file.  Otherwise the
    URL is percent-quoted.  The result is stored in ``self._url``.

    :param url: URL string or ``pathlib.Path`` to visit
    :param retry: retry count; None selects ``self.retry_times``
    :param interval: retry interval; None selects ``self.retry_interval``
    :return: tuple ``(retry, interval, is_file)``
    """
    is_file = False
    if isinstance(url, Path) or ('://' not in url and ':\\\\' not in url):
        p = Path(url)
        if p.exists():
            url = str(p.absolute())
            is_file = True
        elif isinstance(url, Path):
            # quote() only accepts str/bytes; a Path pointing at a missing
            # file used to raise TypeError here.
            url = str(url)

    self._url = url if is_file else quote(url, safe='-_.~!*\'"();:@&=+$,/\\?#[]%')
    retry = retry if retry is not None else self.retry_times
    interval = interval if interval is not None else self.retry_interval
    return retry, interval, is_file
def _ele(self, locator, timeout=None, index=1, raise_err=None, method=None):
    """Run an element lookup through ``_find_elements``.

    :param locator: locator of the element(s); must not be empty
    :param timeout: search timeout in seconds
    :param index: which match to return (1-based, negative counts from the end)
    :param raise_err: True forces an exception when nothing is found
    :param method: name of the calling method, recorded on the result or the error
    :return: the found element, or the list of found elements
    """
    if not locator:
        raise ElementNotFoundError(None, method, {'locator': locator})

    found = self._find_elements(locator, timeout=timeout, index=index, raise_err=raise_err)

    # A truthy result or any list (even an empty one) is returned as is.
    if found or isinstance(found, list):
        return found

    call_args = {'locator': locator, 'index': index}
    if Settings.raise_when_ele_not_found or raise_err is True:
        raise ElementNotFoundError(None, method, call_args)

    # Otherwise annotate the falsy result with how it was requested.
    found.method = method
    found.args = call_args
    return found
class BaseElement(BaseParser):
    # Type stub for the common base class of element objects.

    def __init__(self, owner: BasePage = None):
        self.owner: BasePage = ...  # object passed in as ``owner``

    # ---------------- attributes / methods below are implemented by subclasses ----------------
    @property
    def tag(self) -> str: ...

    # Lookup entry point; compared with BaseParser._ele it adds ``relative``.
    def _ele(self,
             locator: Union[Tuple[str, str], str],
             timeout: float = None,
             index: Optional[int] = 1,
             relative: bool = False,
             raise_err: bool = None,
             method: str = None): ...

    def parent(self, level_or_loc: Union[tuple, str, int] = 1): ...

    def prev(self, index: int = 1) -> None: ...

    def prevs(self) -> None: ...

    def next(self, index: int = 1): ...

    def nexts(self): ...

    # The implementation rejects anything but int, str or tuple with TypeError.
    def get_frame(self, loc_or_ind: Union[str, int, tuple], timeout: float = None) -> ChromiumFrame: ...
+ + def parent(self, + level_or_loc: Union[tuple, str, int] = 1, + index: int = 1, + timeout: float = None) -> Union[DrissionElement, None]: ... + + def child(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[DrissionElement, str, NoneElement]: ... + + def prev(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[DrissionElement, str, NoneElement]: ... + + def next(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[DrissionElement, str, NoneElement]: ... + + def before(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[DrissionElement, str, NoneElement]: ... + + def after(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[DrissionElement, str, NoneElement]: ... + + def children(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> List[Union[DrissionElement, str]]: ... + + def prevs(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> List[Union[DrissionElement, str]]: ... + + def nexts(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> List[Union[DrissionElement, str]]: ... + + def befores(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> List[Union[DrissionElement, str]]: ... + + def afters(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> List[Union[DrissionElement, str]]: ... 
class BasePage(BaseParser):
    # Type stub for the base class of page objects.

    def __init__(self):
        self._url_available: bool = ...
        self.retry_times: int = ...
        self.retry_interval: float = ...
        self._download_path: str = ...
        self._DownloadKit: DownloadKit = ...
        self._none_ele_return_value: bool = ...
        self._none_ele_value: Any = ...
        # NOTE(review): BasePage.__init__ in base.py does not assign ``_page``,
        # and assigns ``_url`` / ``_type`` which are not declared here - confirm.
        self._page: Union[ChromiumPage, SessionPage, MixPage] = ...

    @property
    def title(self) -> Union[str, None]: ...

    @property
    def url_available(self) -> bool: ...

    @property
    def download_path(self) -> str: ...

    # Lazily created DownloadKit bound to this page.
    @property
    def download(self) -> DownloadKit: ...

    # Returns (retry, interval, is_file).
    # NOTE(review): the implementation also accepts a pathlib.Path for ``url`` - confirm.
    def _before_connect(self, url: str, retry: int, interval: float) -> tuple: ...

    # ---------------- attributes / methods below are implemented by subclasses ----------------
    @property
    def url(self) -> str: ...

    @property
    def json(self) -> dict: ...

    @property
    def user_agent(self) -> str: ...

    @abstractmethod
    def get(self, url: str, show_errmsg: bool = False, retry: int = None, interval: float = None): ...

    def _ele(self,
             locator,
             timeout: float = None,
             index: Optional[int] = 1,
             raise_err: bool = None,
             method: str = None): ...
def __new__(cls, addr_or_opts=None, session_options=None):
    """Return the Chromium object for the target browser, creating it on first use.

    Instances are cached in ``cls._BROWSERS`` keyed by browser id, so asking for the
    same browser twice yields the same object.

    :param addr_or_opts: browser 'address:port' string, ChromiumOptions object, or port number (int)
    :param session_options: default Session configuration used by dual-mode tabs; True means use the ini file settings
    :return: the new or cached Chromium object
    """
    opt = handle_options(addr_or_opts)
    is_headless, browser_id, is_exists = run_browser(opt)

    # Look-up and registration share one critical section, so two threads that
    # connect to the same browser cannot both create an instance.
    with cls._lock:
        browser = cls._BROWSERS.get(browser_id)
        if browser is None:
            browser = object.__new__(cls)
            browser._chromium_options = opt
            browser.is_headless = is_headless
            browser._is_exists = is_exists
            browser.id = browser_id
            cls._BROWSERS[browser_id] = browser
            return browser

    # A cached instance may still be inside __init__ on another thread. Wait for
    # its driver to exist, but do it after releasing the lock: _get_tab() and
    # _get_tabs() take the same lock and must not be blocked while we poll.
    while not hasattr(browser, '_driver'):
        sleep(.1)
    return browser
@property
def tabs_count(self):
    """Return how many tabs are open.

    Counts the targets of type 'page' or 'webview' whose url is not a devtools:// page.
    """
    # Original author's note: do not switch this call to get, to avoid hanging.
    targets = self._run_cdp('Target.getTargets')['targetInfos']
    return sum(1 for info in targets
               if info['type'] in ('page', 'webview') and not info['url'].startswith('devtools://'))
:param new_window: 是否在新窗口打开标签页 + :param background: 是否不激活新标签页,如new_window为True则无效 + :param new_context: 是否创建新的上下文 + :return: 新标签页对象 + """ + return self._new_tab(ChromiumTab, url=url, new_window=new_window, + background=background, new_context=new_context) + + def new_mix_tab(self, url=None, new_window=False, background=False, new_context=False): + """新建一个标签页 + :param url: 新标签页跳转到的网址 + :param new_window: 是否在新窗口打开标签页 + :param background: 是否不激活新标签页,如new_window为True则无效 + :param new_context: 是否创建新的上下文 + :return: 新标签页对象 + """ + return self._new_tab(MixTab, url=url, new_window=new_window, + background=background, new_context=new_context) + + def _new_tab(self, obj, url=None, new_window=False, background=False, new_context=False): + """新建一个标签页 + :param obj: 要创建的Tab类型 + :param url: 新标签页跳转到的网址 + :param new_window: 是否在新窗口打开标签页 + :param background: 是否不激活新标签页,如new_window为True则无效 + :param new_context: 是否创建新的上下文 + :return: 新标签页对象 + """ + tab = None + if new_context: + tab = self._run_cdp('Target.createBrowserContext')['browserContextId'] + + kwargs = {'url': ''} + if new_window: + kwargs['newWindow'] = True + if background: + kwargs['background'] = True + if tab: + kwargs['browserContextId'] = tab + + try: + tab = self._run_cdp('Target.createTarget', **kwargs)['targetId'] + except CDPError: + data = ('a', {'href': url or 'https://#', 'target': '_new' if new_window else '_blank'}) + tab = self.get_mix_tab() if isinstance(obj, MixTab) else self.get_tab() + return tab.add_ele(data).click.for_new_tab(by_js=True) + + while tab not in self._drivers: + sleep(.1) + tab = obj(self, tab) + if url: + tab.get(url) + return tab + + def get_tab(self, id_or_num=None, title=None, url=None, tab_type='page', as_id=False): + """获取一个标签页对象,id_or_num不为None时,后面几个参数无效 + :param id_or_num: 要获取的标签页id或序号,序号从1开始,可传入负数获取倒数第几个,不是视觉排列顺序,而是激活顺序 + :param title: 要匹配title的文本,模糊匹配,为None则匹配所有 + :param url: 要匹配url的文本,模糊匹配,为None则匹配所有 + :param tab_type: tab类型,可用列表输入多个,如 'page', 'iframe' 等,为None则匹配所有 + :param as_id: 
是否返回标签页id而不是标签页对象 + :return: Tab对象 + """ + return self._get_tab(id_or_num=id_or_num, title=title, url=url, tab_type=tab_type, as_id=as_id) + + def get_tabs(self, title=None, url=None, tab_type='page', as_id=False): + """查找符合条件的tab,返回它们组成的列表,title和url是与关系 + :param title: 要匹配title的文本 + :param url: 要匹配url的文本 + :param tab_type: tab类型,可用列表输入多个 + :param as_id: 是否返回标签页id而不是标签页对象 + :return: Tab对象列表 + """ + return self._get_tabs(title=title, url=url, tab_type=tab_type, as_id=as_id) + + def get_mix_tab(self, id_or_num=None, title=None, url=None, tab_type='page', as_id=False): + """获取一个标签页对象,id_or_num不为None时,后面几个参数无效 + :param id_or_num: 要获取的标签页id或序号,序号从1开始,可传入负数获取倒数第几个,不是视觉排列顺序,而是激活顺序 + :param title: 要匹配title的文本,模糊匹配,为None则匹配所有 + :param url: 要匹配url的文本,模糊匹配,为None则匹配所有 + :param tab_type: tab类型,可用列表输入多个,如 'page', 'iframe' 等,为None则匹配所有 + :param as_id: 是否返回标签页id而不是标签页对象 + :return: Tab对象 + """ + return self._get_tab(id_or_num=id_or_num, title=title, url=url, tab_type=tab_type, mix=True, as_id=as_id) + + def get_mix_tabs(self, title=None, url=None, tab_type='page', as_id=False): + """查找符合条件的tab,返回它们组成的列表,title和url是与关系 + :param title: 要匹配title的文本 + :param url: 要匹配url的文本 + :param tab_type: tab类型,可用列表输入多个 + :param as_id: 是否返回标签页id而不是标签页对象 + :return: Tab对象列表 + """ + return self._get_tabs(title=title, url=url, tab_type=tab_type, mix=True, as_id=as_id) + + def _get_tab(self, id_or_num=None, title=None, url=None, tab_type='page', mix=False, as_id=False): + """获取一个标签页对象,id_or_num不为None时,后面几个参数无效 + :param id_or_num: 要获取的标签页id或序号,序号从1开始,可传入负数获取倒数第几个,不是视觉排列顺序,而是激活顺序 + :param title: 要匹配title的文本,模糊匹配,为None则匹配所有 + :param url: 要匹配url的文本,模糊匹配,为None则匹配所有 + :param tab_type: tab类型,可用列表输入多个,如 'page', 'iframe' 等,为None则匹配所有 + :param mix: 是否返回可切换模式的Tab对象 + :param as_id: 是否返回标签页id而不是标签页对象,mix=False时无效 + :return: Tab对象 + """ + if id_or_num is not None: + if isinstance(id_or_num, str): + id_or_num = id_or_num + elif isinstance(id_or_num, int): + id_or_num = self.tab_ids[id_or_num - 1 if id_or_num > 0 else 
id_or_num] + elif isinstance(id_or_num, ChromiumTab): + return id_or_num.tab_id if as_id else ChromiumTab(self, id_or_num.tab_id) + + elif title == url is None and tab_type == 'page': + id_or_num = self.tab_ids[0] + + else: + tabs = self._get_tabs(title=title, url=url, tab_type=tab_type, as_id=True) + if tabs: + id_or_num = tabs[0] + else: + return None + + if as_id: + return id_or_num + with self._lock: + return MixTab(self, id_or_num) if mix else ChromiumTab(self, id_or_num) + + def _get_tabs(self, title=None, url=None, tab_type='page', mix=False, as_id=False): + """查找符合条件的tab,返回它们组成的列表,title和url是与关系 + :param title: 要匹配title的文本 + :param url: 要匹配url的文本 + :param tab_type: tab类型,可用列表输入多个 + :param mix: 是否返回可切换模式的Tab对象 + :param as_id: 是否返回标签页id而不是标签页对象,mix=False时无效 + :return: Tab对象列表 + """ + tabs = self._driver.get(f'http://{self.address}/json').json() # 不要改用cdp + + if isinstance(tab_type, str): + tab_type = {tab_type} + elif isinstance(tab_type, (list, tuple, set)): + tab_type = set(tab_type) + elif tab_type is not None: + raise TypeError('tab_type只能是set、list、tuple、str、None。') + + tabs = [i for i in tabs if ((title is None or title in i['title']) and (url is None or url in i['url']) + and (tab_type is None or i['type'] in tab_type))] + if as_id: + return [tab['id'] for tab in tabs] + with self._lock: + if mix: + return [MixTab(self, tab['id']) for tab in tabs] + else: + return [ChromiumTab(self, tab['id']) for tab in tabs] + + def close_tabs(self, tabs_or_ids=None, others=False): + """关闭传入的标签页,默认关闭当前页。可传入多个 + :param tabs_or_ids: 要关闭的标签页对象或id,可传入列表或元组,为None时关闭最后操作的 + :param others: 是否关闭指定标签页之外的 + :return: None + """ + all_tabs = set(self.tab_ids) + if isinstance(tabs_or_ids, str): + tabs = {tabs_or_ids} + elif isinstance(tabs_or_ids, ChromiumTab): + tabs = {tabs_or_ids.tab_id} + elif tabs_or_ids is None: + tabs = {self.tab_ids[0]} + elif isinstance(tabs_or_ids, (list, tuple)): + tabs = set(i.tab_id if isinstance(i, ChromiumTab) else i for i in tabs_or_ids) + else: + 
def activate_tab(self, id_ind_tab):
    """Make a tab the active one.

    :param id_ind_tab: tab id (str), Tab object, or tab number (int); numbers start at 1
                       and negative numbers count from the end, as in _get_tab()
    :return: None
    """
    if isinstance(id_ind_tab, int):
        # Same conversion as _get_tab(): 1-based positives become 0-based, while
        # negatives are already valid Python indexes and are left alone. The old
        # "+= -1" shifted negatives too, so -1 selected the second-to-last tab.
        position = id_ind_tab - 1 if id_ind_tab > 0 else id_ind_tab
        id_ind_tab = self.tab_ids[position]
    elif isinstance(id_ind_tab, ChromiumTab):
        id_ind_tab = id_ind_tab.tab_id
    self._run_cdp('Target.activateTarget', targetId=id_ind_tab)
findstr {pid}' if system().lower() == 'windows' else f'ps -ef | grep {pid}' + p = popen(txt) + sleep(.05) + try: + if f' {pid} ' in p.read(): + ok = False + break + except TypeError: + pass + + if ok: + break + + if del_data and not self._chromium_options.is_auto_port and self._chromium_options.user_data_path: + path = Path(self._chromium_options.user_data_path) + rmtree(path, True) + + def _get_driver(self, tab_id, owner=None): + """新建并返回指定tab id的Driver + :param tab_id: 标签页id + :param owner: 使用该驱动的对象 + :return: Driver对象 + """ + d = self._drivers.pop(tab_id, None) + if not d: + d = Driver(tab_id, 'page', self.address) + d.owner = owner + self._all_drivers.setdefault(tab_id, set()).add(d) + return d + + def _onTargetCreated(self, **kwargs): + """标签页创建时执行""" + if (kwargs['targetInfo']['type'] in ('page', 'webview') + and kwargs['targetInfo']['targetId'] not in self._all_drivers + and not kwargs['targetInfo']['url'].startswith('devtools://')): + try: + tab_id = kwargs['targetInfo']['targetId'] + d = Driver(tab_id, 'page', self.address) + self._drivers[tab_id] = d + self._all_drivers.setdefault(tab_id, set()).add(d) + except WebSocketBadStatusException: + pass + + def _onTargetDestroyed(self, **kwargs): + """标签页关闭时执行""" + tab_id = kwargs['targetId'] + self._dl_mgr.clear_tab_info(tab_id) + for key in [k for k, i in self._frames.items() if i == tab_id]: + self._frames.pop(key, None) + for d in self._all_drivers.get(tab_id, tuple()): + d.stop() + self._drivers.pop(tab_id, None) + self._all_drivers.pop(tab_id, None) + + def _run_cdp(self, cmd, **cmd_args): + """执行Chrome DevTools Protocol语句 + :param cmd: 协议项目 + :param cmd_args: 参数 + :return: 执行的结果 + """ + ignore = cmd_args.pop('_ignore', None) + r = self._driver.run(cmd, **cmd_args) + return r if __ERROR__ not in r else raise_error(r, ignore) + + def _on_disconnect(self): + Chromium._BROWSERS.pop(self.id, None) + if self._chromium_options.is_auto_port and self._chromium_options.user_data_path: + path = 
def handle_options(addr_or_opts):
    """Build the ChromiumOptions object used to start or attach to a browser.

    :param addr_or_opts: 'ip:port' string, port number (int), ChromiumOptions object, or a falsy value for the defaults
    :return: ChromiumOptions object
    :raises TypeError: when addr_or_opts is none of the accepted types
    """

    def _resolve_auto_port(opts):
        # When auto-port is enabled, ask PortFinder for a port and data folder and store them.
        # is_auto_port is read again for each call, exactly as the setters may have left it.
        if opts.is_auto_port:
            finder = PortFinder(opts.tmp_path)
            port, path = finder.get_port(opts.is_auto_port)
            opts.set_address(f'127.0.0.1:{port}')
            opts.set_user_data_path(path)
            opts.auto_port(scope=opts.is_auto_port)
        return opts

    if not addr_or_opts:
        return _resolve_auto_port(ChromiumOptions(addr_or_opts))

    if isinstance(addr_or_opts, ChromiumOptions):
        return _resolve_auto_port(addr_or_opts)

    if isinstance(addr_or_opts, str):
        from_address = ChromiumOptions()
        from_address.set_address(addr_or_opts)
        return from_address

    if isinstance(addr_or_opts, int):
        from_port = ChromiumOptions()
        from_port.set_local_port(addr_or_opts)
        return from_port

    raise TypeError('只能接收ip:port格式或ChromiumOptions类型参数。')
class Chromium(object):
    # Type stub for the Chromium browser object implemented in browser.py.
    id: str = ...
    address: str = ...
    version: str = ...
    retry_times: int = ...
    retry_interval: float = ...
    is_headless: bool = ...

    _BROWSERS: dict = ...
    _chromium_options: ChromiumOptions = ...
    _session_options: SessionOptions = ...
    _driver: BrowserDriver = ...
    _frames: dict = ...
    _drivers: Dict[str, Driver] = ...
    _all_drivers: Dict[str, Set[Driver]] = ...
    _process_id: Optional[int] = ...
    _dl_mgr: DownloadManager = ...
    _lock: Lock = ...

    _set: Optional[BrowserSetter] = ...
    _wait: Optional[BrowserWaiter] = ...
    _timeouts: Timeout = ...
    _load_mode: str = ...
    _download_path: str = ...
    _is_exists: bool = ...

    # session_options may also be True: the implementation then builds a default SessionOptions().
    def __new__(cls,
                addr_or_opts: Union[str, int, ChromiumOptions] = None,
                session_options: Union[SessionOptions, bool, None] = None): ...

    def __init__(self, addr_or_opts: Union[str, int, ChromiumOptions] = None,
                 session_options: Union[SessionOptions, bool, None] = None): ...

    def _get_driver(self, tab_id: str, owner=None) -> Driver: ...

    def _run_cdp(self, cmd, **cmd_args) -> dict: ...

    @property
    def user_data_path(self) -> str: ...

    @property
    def process_id(self) -> Optional[int]: ...

    @property
    def timeout(self) -> float: ...

    @property
    def timeouts(self) -> Timeout: ...

    @property
    def load_mode(self) -> str: ...

    @property
    def download_path(self) -> str: ...

    @property
    def set(self) -> BrowserSetter: ...

    @property
    def wait(self) -> BrowserWaiter: ...

    @property
    def tabs_count(self) -> int: ...

    @property
    def tab_ids(self) -> List[str]: ...

    @property
    def latest_tab(self) -> Union[ChromiumTab, str]: ...

    def cookies(self, all_info: bool = False) -> CookiesList: ...

    # Tuple[X, ...] is a tuple of any length; Tuple[X] would only describe a 1-tuple.
    def close_tabs(self,
                   tabs_or_ids: Union[str, ChromiumTab, List[Union[str, ChromiumTab]],
                   Tuple[Union[str, ChromiumTab], ...]] = None,
                   others: bool = False) -> None: ...

    def get_tab(self,
                id_or_num: Union[str, int] = None,
                title: str = None,
                url: str = None,
                tab_type: str = 'page',
                as_id: bool = False) -> Union[ChromiumTab, str]: ...

    # List takes exactly one type argument, so mixed element types are spelled with Union.
    def get_tabs(self,
                 title: str = None,
                 url: str = None,
                 tab_type: str = 'page',
                 as_id: bool = False) -> List[Union[ChromiumTab, str]]: ...

    # as_id is accepted by the implementation and is declared here to match it.
    def get_mix_tab(self,
                    id_or_num: Union[str, int] = None,
                    title: str = None,
                    url: str = None,
                    tab_type: str = 'page',
                    as_id: bool = False) -> Union[MixTab, str]: ...

    def get_mix_tabs(self,
                     title: str = None,
                     url: str = None,
                     tab_type: str = 'page',
                     as_id: bool = False) -> List[Union[MixTab, str]]: ...

    def _get_tab(self,
                 id_or_num: Union[str, int] = None,
                 title: str = None,
                 url: str = None,
                 tab_type: str = 'page',
                 mix: bool = False,
                 as_id: bool = False) -> Union[ChromiumTab, str]: ...

    def _get_tabs(self,
                  title: str = None,
                  url: str = None,
                  tab_type: str = 'page',
                  mix: bool = False,
                  as_id: bool = False) -> List[Union[ChromiumTab, str]]: ...

    def activate_tab(self, id_ind_tab: Union[int, str, ChromiumTab]) -> None: ...

    def _new_tab(self,
                 obj,
                 url: str = None,
                 new_window: bool = False,
                 background: bool = False,
                 new_context: bool = False) -> Union[ChromiumTab, MixTab]: ...

    def new_tab(self,
                url: str = None,
                new_window: bool = False,
                background: bool = False,
                new_context: bool = False) -> ChromiumTab: ...

    def new_mix_tab(self,
                    url: str = None,
                    new_window: bool = False,
                    background: bool = False,
                    new_context: bool = False) -> MixTab: ...

    def reconnect(self) -> None: ...

    def _onTargetCreated(self, **kwargs) -> None: ...

    def _onTargetDestroyed(self, **kwargs) -> None: ...

    def quit(self, timeout: float = 5, force: bool = False, del_data: bool = False) -> None: ...

    def _on_disconnect(self) -> None: ...
+""" +from json import dumps, loads, JSONDecodeError +from queue import Queue, Empty +from threading import Thread +from time import perf_counter, sleep + +from requests import adapters +from requests import Session +from websocket import (WebSocketTimeoutException, WebSocketConnectionClosedException, create_connection, + WebSocketException, WebSocketBadStatusException) + +from .._functions.settings import Settings +from ..errors import PageDisconnectedError, BrowserConnectError + +adapters.DEFAULT_RETRIES = 5 + + +class Driver(object): + def __init__(self, tab_id, tab_type, address, owner=None): + """ + :param tab_id: 标签页id + :param tab_type: 标签页类型 + :param address: 浏览器连接地址 + :param owner: 创建这个驱动的对象 + """ + self.id = tab_id + self.address = address + self.type = tab_type + self.owner = owner + # self._debug = True + # self._debug = False + self.alert_flag = False # 标记alert出现,跳过一条请求后复原 + + self._websocket_url = f'ws://{address}/devtools/{tab_type}/{tab_id}' + self._cur_id = 0 + self._ws = None + + self._recv_th = Thread(target=self._recv_loop) + self._handle_event_th = Thread(target=self._handle_event_loop) + self._recv_th.daemon = True + self._handle_event_th.daemon = True + self._handle_immediate_event_th = None + + self.is_running = False + + self.event_handlers = {} + self.immediate_event_handlers = {} + self.method_results = {} + self.event_queue = Queue() + self.immediate_event_queue = Queue() + + self.start() + + def _send(self, message, timeout=None): + """发送信息到浏览器,并返回浏览器返回的信息 + :param message: 发送给浏览器的数据 + :param timeout: 超时时间,为None表示无限 + :return: 浏览器返回的数据 + """ + self._cur_id += 1 + ws_id = self._cur_id + message['id'] = ws_id + message_json = dumps(message) + + # if self._debug: + # if self._debug is True or (isinstance(self._debug, str) and + # message.get('method', '').startswith(self._debug)): + # print(f'发> {message_json}') + # elif isinstance(self._debug, (list, tuple, set)): + # for m in self._debug: + # if message.get('method', '').startswith(m): + 
# print(f'发> {message_json}') + # break + + end_time = perf_counter() + timeout if timeout is not None else None + self.method_results[ws_id] = Queue() + try: + self._ws.send(message_json) + if timeout == 0: + self.method_results.pop(ws_id, None) + return {'id': ws_id, 'result': {}} + + except (OSError, WebSocketConnectionClosedException): + self.method_results.pop(ws_id, None) + return {'error': {'message': 'connection disconnected'}, 'type': 'connection_error'} + + while self.is_running: + try: + result = self.method_results[ws_id].get(timeout=.2) + self.method_results.pop(ws_id, None) + return result + + except Empty: + if self.alert_flag and message['method'].startswith(('Input.', 'Runtime.')): + return {'error': {'message': 'alert exists.'}, 'type': 'alert_exists'} + + if timeout is not None and perf_counter() > end_time: + self.method_results.pop(ws_id, None) + return {'error': {'message': 'alert exists.'}, 'type': 'alert_exists'} \ + if self.alert_flag else {'error': {'message': 'timeout'}, 'type': 'timeout'} + + continue + + return {'error': {'message': 'connection disconnected'}, 'type': 'connection_error'} + + def _recv_loop(self): + """接收浏览器信息的守护线程方法""" + while self.is_running: + try: + # self._ws.settimeout(1) + msg_json = self._ws.recv() + msg = loads(msg_json) + except WebSocketTimeoutException: + continue + except (WebSocketException, OSError, WebSocketConnectionClosedException, JSONDecodeError): + self._stop() + return + + # if self._debug: + # if self._debug is True or 'id' in msg or (isinstance(self._debug, str) + # and msg.get('method', '').startswith(self._debug)): + # print(f'<收 {msg_json}') + # elif isinstance(self._debug, (list, tuple, set)): + # for m in self._debug: + # if msg.get('method', '').startswith(m): + # print(f'<收 {msg_json}') + # break + + if 'method' in msg: + if msg['method'].startswith('Page.javascriptDialog'): + self.alert_flag = msg['method'].endswith('Opening') + function = self.immediate_event_handlers.get(msg['method']) + 
def run(self, _method, **kwargs):
    """Execute one CDP method and return its outcome.

    :param _method: CDP method name
    :param kwargs: CDP parameters; the optional '_timeout' key sets the wait time and is not sent to the browser
    :return: the 'result' part of the reply, or a dict with 'error', 'type', 'method', 'args' and 'data' keys on failure
    """
    if not self.is_running:
        return {'error': 'connection disconnected', 'type': 'connection_error'}

    timeout = kwargs.pop('_timeout', Settings.cdp_timeout)
    reply = self._send({'method': _method, 'params': kwargs}, timeout=timeout)

    failed = 'result' not in reply and 'error' in reply
    if not failed:
        return reply['result']

    # Put the timeout back so that 'args' reports the call exactly as it was made.
    kwargs['_timeout'] = timeout
    error = reply['error']
    return {'error': error['message'],
            'type': reply.get('type', 'call_method_error'),
            'method': _method,
            'args': kwargs,
            'data': error.get('data')}
def set_callback(self, event, callback, immediate=False):
    """Bind a callback to a CDP event, or remove the binding.

    :param event: CDP event name
    :param callback: function to call for the event; a falsy value removes the current binding
    :param immediate: whether to use the table of immediately handled events instead of the queued one
    :return: None
    """
    if immediate:
        registry = self.immediate_event_handlers
    else:
        registry = self.event_handlers

    if not callback:
        registry.pop(event, None)
        return
    registry[event] = callback
super().__init__(tab_id, tab_type, address, owner) + + def __repr__(self): + return f'' + + def get(self, url): + s = Session() + s.trust_env = False + s.keep_alive = False + r = s.get(url, headers={'Connection': 'close'}) + r.close() + s.close() + return r diff --git a/src/flaresolverr/DrissionPage/_base/driver.pyi b/src/flaresolverr/DrissionPage/_base/driver.pyi new file mode 100644 index 0000000000..582f16a3be --- /dev/null +++ b/src/flaresolverr/DrissionPage/_base/driver.pyi @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. +""" +from queue import Queue +from threading import Thread +from typing import Union, Callable, Dict, Optional + +from requests import Response +from websocket import WebSocket + +from .browser import Chromium + + +class GenericAttr(object): + def __init__(self, name: str, tab: Driver): ... + + def __getattr__(self, item: str) -> Callable: ... + + def __setattr__(self, key: str, value: Callable) -> None: ... + + +class Driver(object): + id: str + address: str + type: str + owner = ... + alert_flag: bool + _websocket_url: str + _cur_id: int + _ws: Optional[WebSocket] + _recv_th: Thread + _handle_event_th: Thread + _handle_immediate_event_th: Optional[Thread] + # _stopped: Event + is_running: bool + event_handlers: dict + immediate_event_handlers: dict + method_results: dict + event_queue: Queue + immediate_event_queue: Queue + + def __init__(self, tab_id: str, tab_type: str, address: str, owner=None): ... + + def _send(self, message: dict, timeout: float = None) -> dict: ... + + def _recv_loop(self) -> None: ... + + def _handle_event_loop(self) -> None: ... + + def _handle_immediate_event_loop(self): ... + + def _handle_immediate_event(self, function: Callable, kwargs: dict): ... + + def run(self, _method: str, **kwargs) -> dict: ... + + def start(self) -> bool: ... + + def stop(self) -> bool: ... 
def __init__(self, read_file=True, ini_path=None):
    """Load the browser start-up options, optionally from an ini file.

    :param read_file: whether to read the configuration from an ini file; False skips reading
    :param ini_path: path of the ini file; when empty the default ini file is used
    :raises ValueError: when ini_path is given but the file does not exist
    """
    self._user_data_path = None
    self._user = 'Default'
    self._prefs_to_del = []
    self.clear_file_flags = False
    self._is_headless = False
    self._ua_set = False

    # Decide what is handed to OptionsManager: False when reading is disabled, the
    # absolute Path when a file was named, otherwise the caller's (empty) value.
    if read_file is False:
        ini_path = False
        self.ini_path = None
    elif ini_path:
        ini_path = Path(ini_path).absolute()
        if not ini_path.exists():
            raise ValueError(f'文件不存在:{ini_path}')
        self.ini_path = str(ini_path)
    else:
        # Recorded location of the default file: configs.ini next to this module.
        self.ini_path = str(Path(__file__).parent / 'configs.ini')

    om = OptionsManager(ini_path)
    options = om.chromium_options
    # "or" turns empty values from the file into the fallback.
    self._download_path = om.paths.get('download_path', '.') or '.'
    self._tmp_path = om.paths.get('tmp_path', None) or None
    self._arguments = options.get('arguments', [])
    self._browser_path = options.get('browser_path', '')
    self._extensions = options.get('extensions', [])
    self._prefs = options.get('prefs', {})
    self._flags = options.get('flags', {})
    self._address = options.get('address', None)
    self._load_mode = options.get('load_mode', 'normal')
    self._system_user_path = options.get('system_user_path', False)
    self._existing_only = options.get('existing_only', False)
    self._new_env = options.get('new_env', False)
    # Any argument that starts with --headless marks the configuration as headless.
    for i in self._arguments:
        if i.startswith('--headless'):
            self._is_headless = True
            break

    # The http proxy is preferred; https is used only when http is empty.
    self._proxy = om.proxies.get('http', None) or om.proxies.get('https', None)

    # Pick up the user data folder and profile name from the arguments.
    # NOTE(review): set_paths()/set_user() are called while self._arguments is being
    # iterated, and they are defined outside this view; confirm before reordering.
    user_path = user = False
    for arg in self._arguments:
        if arg.startswith('--user-data-dir='):
            self.set_paths(user_data_path=arg[16:])  # 16 == len('--user-data-dir=')
            user_path = True
        if arg.startswith('--profile-directory='):
            self.set_user(arg[20:])  # 20 == len('--profile-directory=')
            user = True
        if user and user_path:
            break

    timeouts = om.timeouts
    self._timeouts = {'base': timeouts['base'],
                      'page_load': timeouts['page_load'],
                      'script': timeouts['script']}

    # Assigned after the loop above, so this value is the one that remains in effect.
    self._auto_port = options.get('auto_port', False)

    others = om.others
    self._retry_times = others.get('retry_times', 3)
    self._retry_interval = others.get('retry_interval', 2)

    return
proxy(self): + """返回代理设置""" + return self._proxy + + @property + def address(self): + """返回浏览器地址,ip:port""" + return self._address + + @property + def arguments(self): + """返回浏览器命令行设置列表""" + return self._arguments + + @property + def extensions(self): + """以list形式返回要加载的插件路径""" + return self._extensions + + @property + def preferences(self): + """返回用户首选项配置""" + return self._prefs + + @property + def flags(self): + """返回实验项配置""" + return self._flags + + @property + def system_user_path(self): + """返回是否使用系统安装的浏览器所使用的用户数据文件夹""" + return self._system_user_path + + @property + def is_existing_only(self): + """返回是否只接管现有浏览器方式""" + return self._existing_only + + @property + def is_auto_port(self): + """返回是否使用自动端口和用户文件,如指定范围则返回范围tuple""" + return self._auto_port + + @property + def retry_times(self): + """返回连接失败时的重试次数""" + return self._retry_times + + @property + def retry_interval(self): + """返回连接失败时的重试间隔(秒)""" + return self._retry_interval + + @property + def is_headless(self): + """返回是否无头模式""" + return self._is_headless + + def set_retry(self, times=None, interval=None): + """设置连接失败时的重试操作 + :param times: 重试次数 + :param interval: 重试间隔 + :return: 当前对象 + """ + if times is not None: + self._retry_times = times + if interval is not None: + self._retry_interval = interval + return self + + def set_argument(self, arg, value=None): + """设置浏览器配置的argument属性 + :param arg: 属性名 + :param value: 属性值,有值的属性传入值,没有的传入None,如传入False,删除该项 + :return: 当前对象 + """ + self.remove_argument(arg) + if value is not False: + if arg == '--headless': + if value == 'false': + self._is_headless = False + else: + if value is None: + value = 'new' + self._arguments.append(f'--headless={value}') + self._is_headless = True + else: + arg_str = arg if value is None else f'{arg}={value}' + self._arguments.append(arg_str) + elif arg == '--headless': + self._is_headless = False + return self + + def remove_argument(self, value): + """移除一个argument项 + :param value: 设置项名,有值的设置项传入设置名称即可 + :return: 当前对象 + """ + 
elements_to_delete = [arg for arg in self._arguments if arg == value or arg.startswith(f'{value}=')] + if not elements_to_delete: + return self + + if len(elements_to_delete) == 1: + self._arguments.remove(elements_to_delete[0]) + else: + self._arguments = [arg for arg in self._arguments if arg not in elements_to_delete] + + return self + + def add_extension(self, path): + """添加插件 + :param path: 插件路径,可指向文件夹 + :return: 当前对象 + """ + path = Path(path) + if not path.exists(): + raise OSError('插件路径不存在。') + self._extensions.append(str(path)) + return self + + def remove_extensions(self): + """移除所有插件 + :return: 当前对象 + """ + self._extensions = [] + return self + + def set_pref(self, arg, value): + """设置Preferences文件中的用户设置项 + :param arg: 设置项名称 + :param value: 设置项值 + :return: 当前对象 + """ + self._prefs[arg] = value + return self + + def remove_pref(self, arg): + """删除用户首选项设置,不能删除已设置到文件中的项 + :param arg: 设置项名称 + :return: 当前对象 + """ + self._prefs.pop(arg, None) + return self + + def remove_pref_from_file(self, arg): + """删除用户配置文件中已设置的项 + :param arg: 设置项名称 + :return: 当前对象 + """ + self._prefs_to_del.append(arg) + return self + + def set_flag(self, flag, value=None): + """设置实验项 + :param flag: 设置项名称 + :param value: 设置项的值,为False则删除该项 + :return: 当前对象 + """ + if value is False: + self._flags.pop(flag, None) + else: + self._flags[flag] = value + return self + + def clear_flags_in_file(self): + """删除浏览器配置文件中已设置的实验项""" + self.clear_file_flags = True + return self + + def clear_flags(self): + """清空本对象已设置的flag参数""" + self._flags = {} + return self + + def clear_arguments(self): + """清空本对象已设置的argument参数""" + self._arguments = [] + return self + + def clear_prefs(self): + """清空本对象已设置的pref参数""" + self._prefs = {} + return self + + def set_timeouts(self, base=None, page_load=None, script=None): + """设置超时时间,单位为秒 + :param base: 默认超时时间 + :param page_load: 页面加载超时时间 + :param script: 脚本运行超时时间 + :return: 当前对象 + """ + if base is not None: + self._timeouts['base'] = base + if page_load is not None: + 
self._timeouts['page_load'] = page_load + if script is not None: + self._timeouts['script'] = script + + return self + + def set_user(self, user='Default'): + """设置使用哪个用户配置文件夹 + :param user: 用户文件夹名称 + :return: 当前对象 + """ + self.set_argument('--profile-directory', user) + self._user = user + return self + + def headless(self, on_off=True): + """设置是否隐藏浏览器界面 + :param on_off: 开或关 + :return: 当前对象 + """ + on_off = 'new' if on_off else on_off + return self.set_argument('--headless', on_off) + + def no_imgs(self, on_off=True): + """设置是否加载图片 + :param on_off: 开或关 + :return: 当前对象 + """ + on_off = None if on_off else False + return self.set_argument('--blink-settings=imagesEnabled=false', on_off) + + def no_js(self, on_off=True): + """设置是否禁用js + :param on_off: 开或关 + :return: 当前对象 + """ + on_off = None if on_off else False + return self.set_argument('--disable-javascript', on_off) + + def mute(self, on_off=True): + """设置是否静音 + :param on_off: 开或关 + :return: 当前对象 + """ + on_off = None if on_off else False + return self.set_argument('--mute-audio', on_off) + + def incognito(self, on_off=True): + """设置是否使用无痕模式启动 + :param on_off: 开或关 + :return: 当前对象 + """ + on_off = None if on_off else False + return self.set_argument('--incognito', on_off) + + def new_env(self, on_off=True): + """设置是否使用全新浏览器环境 + :param on_off: 开或关 + :return: 当前对象 + """ + self._new_env = on_off + return self + + def ignore_certificate_errors(self, on_off=True): + """设置是否忽略证书错误 + :param on_off: 开或关 + :return: 当前对象 + """ + on_off = None if on_off else False + return self.set_argument('--ignore-certificate-errors', on_off) + + def set_user_agent(self, user_agent): + """设置user agent + :param user_agent: user agent文本 + :return: 当前对象 + """ + return self.set_argument('--user-agent', user_agent) + + def set_proxy(self, proxy): + """设置代理 + :param proxy: 代理url和端口 + :return: 当前对象 + """ + if search(r'.*?:.*?@.*?\..*', proxy): + print('你似乎在设置使用账号密码的代理,暂时不支持这种代理,可自行用插件实现需求。') + if proxy.lower().startswith('socks'): + 
print('你似乎在设置使用socks代理,暂时不支持这种代理,可自行用插件实现需求。') + self._proxy = proxy + return self.set_argument('--proxy-server', proxy) + + def set_load_mode(self, value): + """设置load_mode,可接收 'normal', 'eager', 'none' + normal:默认情况下使用, 等待所有资源下载完成 + eager:DOM访问已准备就绪, 但其他资源 (如图像) 可能仍在加载中 + none:完全不阻塞 + :param value: 可接收 'normal', 'eager', 'none' + :return: 当前对象 + """ + if value not in ('normal', 'eager', 'none'): + raise ValueError("只能选择 'normal', 'eager', 'none'。") + self._load_mode = value.lower() + return self + + def set_paths(self, browser_path=None, local_port=None, address=None, download_path=None, + user_data_path=None, cache_path=None): + """快捷的路径设置函数 + :param browser_path: 浏览器可执行文件路径 + :param local_port: 本地端口号 + :param address: 调试浏览器地址,例:127.0.0.1:9222 + :param download_path: 下载文件路径 + :param user_data_path: 用户数据路径 + :param cache_path: 缓存路径 + :return: 当前对象 + """ + if browser_path is not None: + self.set_browser_path(browser_path) + + if local_port is not None: + self.set_local_port(local_port) + + if address is not None: + self.set_address(address) + + if download_path is not None: + self.set_download_path(download_path) + + if user_data_path is not None: + self.set_user_data_path(user_data_path) + + if cache_path is not None: + self.set_cache_path(cache_path) + + return self + + def set_local_port(self, port): + """设置本地启动端口 + :param port: 端口号 + :return: 当前对象 + """ + self._address = f'127.0.0.1:{port}' + self._auto_port = False + return self + + def set_address(self, address): + """设置浏览器地址,格式'ip:port' + :param address: 浏览器地址 + :return: 当前对象 + """ + address = address.replace('localhost', '127.0.0.1').lstrip('http://').lstrip('https://') + self._address = address + return self + + def set_browser_path(self, path): + """设置浏览器可执行文件路径 + :param path: 浏览器路径 + :return: 当前对象 + """ + self._browser_path = str(path) + return self + + def set_download_path(self, path): + """设置下载文件保存路径 + :param path: 下载路径 + :return: 当前对象 + """ + self._download_path = '.' 
if path is None else str(path) + return self + + def set_tmp_path(self, path): + """设置临时文件文件保存路径 + :param path: 下载路径 + :return: 当前对象 + """ + self._tmp_path = str(path) + return self + + def set_user_data_path(self, path): + """设置用户文件夹路径 + :param path: 用户文件夹路径 + :return: 当前对象 + """ + u = str(path) + self.set_argument('--user-data-dir', u) + self._user_data_path = u + self._auto_port = False + return self + + def set_cache_path(self, path): + """设置缓存路径 + :param path: 缓存路径 + :return: 当前对象 + """ + self.set_argument('--disk-cache-dir', str(path)) + return self + + def use_system_user_path(self, on_off=True): + """设置是否使用系统安装的浏览器默认用户文件夹 + :param on_off: 开或关 + :return: 当前对象 + """ + self._system_user_path = on_off + return self + + def auto_port(self, on_off=True, scope=None): + """自动获取可用端口 + :param on_off: 是否开启自动获取端口号 + :param scope: 指定端口范围,不含最后的数字,为None则使用[9600-59600) + :return: 当前对象 + """ + if on_off: + self._auto_port = scope if scope else (9600, 59600) + else: + self._auto_port = False + return self + + def existing_only(self, on_off=True): + """设置只接管已有浏览器,不自动启动新的 + :param on_off: 是否开启自动获取端口号 + :return: 当前对象 + """ + self._existing_only = on_off + return self + + def save(self, path=None): + """保存设置到文件 + :param path: ini文件的路径, None 保存到当前读取的配置文件,传入 'default' 保存到默认ini文件 + :return: 保存文件的绝对路径 + """ + if path == 'default': + path = (Path(__file__).parent / 'configs.ini').absolute() + + elif path is None: + if self.ini_path: + path = Path(self.ini_path).absolute() + else: + path = (Path(__file__).parent / 'configs.ini').absolute() + + else: + path = Path(path).absolute() + + path = path / 'config.ini' if path.is_dir() else path + + if path.exists(): + om = OptionsManager(path) + else: + om = OptionsManager(self.ini_path or (Path(__file__).parent / 'configs.ini')) + + # 设置chromium_options + attrs = ('address', 'browser_path', 'arguments', 'extensions', 'user', 'load_mode', + 'auto_port', 'system_user_path', 'existing_only', 'flags', 'new_env') + for i in attrs: + 
om.set_item('chromium_options', i, self.__getattribute__(f'_{i}')) + # 设置代理 + om.set_item('proxies', 'http', self._proxy or '') + om.set_item('proxies', 'https', self._proxy or '') + # 设置路径 + om.set_item('paths', 'download_path', self._download_path or '') + om.set_item('paths', 'tmp_path', self._tmp_path or '') + # 设置timeout + om.set_item('timeouts', 'base', self._timeouts['base']) + om.set_item('timeouts', 'page_load', self._timeouts['page_load']) + om.set_item('timeouts', 'script', self._timeouts['script']) + # 设置重试 + om.set_item('others', 'retry_times', self.retry_times) + om.set_item('others', 'retry_interval', self.retry_interval) + # 设置prefs + om.set_item('chromium_options', 'prefs', self._prefs) + + path = str(path) + om.save(path) + + return path + + def save_to_default(self): + """保存当前配置到默认ini文件""" + return self.save('default') + + def __repr__(self): + return f'' diff --git a/src/flaresolverr/DrissionPage/_configs/chromium_options.pyi b/src/flaresolverr/DrissionPage/_configs/chromium_options.pyi new file mode 100644 index 0000000000..7399e083b9 --- /dev/null +++ b/src/flaresolverr/DrissionPage/_configs/chromium_options.pyi @@ -0,0 +1,179 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. +""" +from pathlib import Path +from typing import Union, Any, Literal, Optional, Tuple + + +class ChromiumOptions(object): + ini_path: Optional[str] = ... + _driver_path: str = ... + _user_data_path: Optional[str] = ... + _download_path: str = ... + _tmp_path: str = ... + _arguments: list = ... + _browser_path: str = ... + _user: str = ... + _load_mode: str = ... + _timeouts: dict = ... + _proxy: str = ... + _address: str = ... + _extensions: list = ... + _prefs: dict = ... + _flags: dict = ... + _prefs_to_del: list = ... + _new_env: bool = ... + clear_file_flags: bool = ... + _auto_port: Union[Tuple[int, int], False] = ... + _system_user_path: bool = ... 
+ _existing_only: bool = ... + _retry_times: int = ... + _retry_interval: float = ... + _is_headless: bool = ... + _ua_set: bool = ... + + def __init__(self, read_file: [bool, None] = True, ini_path: Union[str, Path] = None): ... + + @property + def download_path(self) -> str: ... + + @property + def browser_path(self) -> str: ... + + @property + def user_data_path(self) -> str: ... + + @property + def tmp_path(self) -> Optional[str]: ... + + @property + def user(self) -> str: ... + + @property + def load_mode(self) -> str: ... + + @property + def timeouts(self) -> dict: ... + + @property + def proxy(self) -> str: ... + + @property + def address(self) -> str: ... + + @property + def arguments(self) -> list: ... + + @property + def extensions(self) -> list: ... + + @property + def preferences(self) -> dict: ... + + @property + def flags(self) -> dict: ... + + @property + def system_user_path(self) -> bool: ... + + @property + def is_existing_only(self) -> bool: ... + + @property + def is_auto_port(self) -> Union[bool, Tuple[int, int]]: ... + + @property + def retry_times(self) -> int: ... + + @property + def retry_interval(self) -> float: ... + + @property + def is_headless(self) -> bool: ... + + def set_retry(self, times: int = None, interval: float = None) -> ChromiumOptions: ... + + def set_argument(self, arg: str, value: Union[str, None, bool] = None) -> ChromiumOptions: ... + + def remove_argument(self, value: str) -> ChromiumOptions: ... + + def add_extension(self, path: Union[str, Path]) -> ChromiumOptions: ... + + def remove_extensions(self) -> ChromiumOptions: ... + + def set_pref(self, arg: str, value: Any) -> ChromiumOptions: ... + + def remove_pref(self, arg: str) -> ChromiumOptions: ... + + def remove_pref_from_file(self, arg: str) -> ChromiumOptions: ... + + def set_flag(self, flag: str, value: Union[int, str, bool] = None) -> ChromiumOptions: ... + + def clear_flags_in_file(self) -> ChromiumOptions: ... 
+ + def clear_flags(self) -> ChromiumOptions: ... + + def clear_arguments(self) -> ChromiumOptions: ... + + def clear_prefs(self) -> ChromiumOptions: ... + + def set_timeouts(self, + base: float = None, + page_load: float = None, + script: float = None) -> ChromiumOptions: ... + + def set_user(self, user: str = 'Default') -> ChromiumOptions: ... + + def headless(self, on_off: bool = True) -> ChromiumOptions: ... + + def no_imgs(self, on_off: bool = True) -> ChromiumOptions: ... + + def no_js(self, on_off: bool = True) -> ChromiumOptions: ... + + def mute(self, on_off: bool = True) -> ChromiumOptions: ... + + def incognito(self, on_off: bool = True) -> ChromiumOptions: ... + + def new_env(self, on_off: bool = True) -> ChromiumOptions: ... + + def set_user_agent(self, user_agent: str) -> ChromiumOptions: ... + + def set_proxy(self, proxy: str) -> ChromiumOptions: ... + + def ignore_certificate_errors(self, on_off=True) -> ChromiumOptions: ... + + def set_load_mode(self, value: Literal['normal', 'eager', 'none']) -> ChromiumOptions: ... + + def set_browser_path(self, path: Union[str, Path]) -> ChromiumOptions: ... + + def set_local_port(self, port: Union[str, int]) -> ChromiumOptions: ... + + def set_address(self, address: str) -> ChromiumOptions: ... + + def set_download_path(self, path: Union[str, Path]) -> ChromiumOptions: ... + + def set_tmp_path(self, path: Union[str, Path]) -> ChromiumOptions: ... + + def set_user_data_path(self, path: Union[str, Path]) -> ChromiumOptions: ... + + def set_cache_path(self, path: Union[str, Path]) -> ChromiumOptions: ... + + def set_paths(self, browser_path: Union[str, Path] = None, local_port: Union[int, str] = None, + address: str = None, download_path: Union[str, Path] = None, user_data_path: Union[str, Path] = None, + cache_path: Union[str, Path] = None) -> ChromiumOptions: ... + + def use_system_user_path(self, on_off: bool = True) -> ChromiumOptions: ... 
+ + def auto_port(self, + on_off: bool = True, + scope: Tuple[int, int] = None) -> ChromiumOptions: ... + + def existing_only(self, on_off: bool = True) -> ChromiumOptions: ... + + def save(self, path: Union[str, Path] = None) -> str: ... + + def save_to_default(self) -> str: ... diff --git a/src/flaresolverr/DrissionPage/_configs/configs.ini b/src/flaresolverr/DrissionPage/_configs/configs.ini new file mode 100644 index 0000000000..d076357430 --- /dev/null +++ b/src/flaresolverr/DrissionPage/_configs/configs.ini @@ -0,0 +1,33 @@ +[paths] +download_path = +tmp_path = + +[chromium_options] +address = 127.0.0.1:9222 +browser_path = chrome +arguments = ['--no-default-browser-check', '--disable-suggestions-ui', '--no-first-run', '--disable-infobars', '--disable-popup-blocking', '--hide-crash-restore-bubble', '--disable-features=PrivacySandboxSettings4'] +extensions = [] +prefs = {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}} +flags = {} +load_mode = normal +user = Default +auto_port = False +system_user_path = False +existing_only = False +new_env = False + +[session_options] +headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'} + +[timeouts] +base = 10 +page_load = 30 +script = 30 + +[proxies] +http = +https = + +[others] +retry_times = 3 +retry_interval = 2 diff --git a/src/flaresolverr/DrissionPage/_configs/options_manage.py b/src/flaresolverr/DrissionPage/_configs/options_manage.py new file mode 100644 index 0000000000..a6aff57a4d --- /dev/null +++ b/src/flaresolverr/DrissionPage/_configs/options_manage.py @@ -0,0 +1,174 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. 
class OptionsManager(object):
    """Reads, edits and writes the ini file that holds the package settings."""

    def __init__(self, path=None):
        """Read the ini file; when no file is read, fill in built-in default values.
        :param path: ini file path. None looks for 'dp_configs.ini' in the working directory
                     and otherwise uses the 'configs.ini' next to this module; 'default' uses
                     that bundled file; False reads no file
        """
        if path is False:
            self.ini_path = None
        else:
            default_configs = Path(__file__).parent / 'configs.ini'
            if path is None:
                dp_configs = Path('dp_configs.ini')
                self.ini_path = dp_configs if dp_configs.exists() else default_configs
            elif path == 'default':
                self.ini_path = default_configs
            elif isinstance(path, Path):
                self.ini_path = path
            else:
                self.ini_path = Path(path)

        self._conf = RawConfigParser()
        if path is not False and self.ini_path.exists():
            self.file_exists = True
            self._conf.read(self.ini_path, encoding='utf-8')
        else:
            self.file_exists = False
            self._load_defaults()

    def _load_defaults(self):
        """Create every section and fill it with the built-in default values."""
        for section in ('paths', 'chromium_options', 'session_options', 'timeouts', 'proxies', 'others'):
            self._conf.add_section(section)

        self.set_item('paths', 'download_path', '')
        self.set_item('paths', 'tmp_path', '')
        self.set_item('chromium_options', 'address', '127.0.0.1:9222')
        self.set_item('chromium_options', 'browser_path', 'chrome')
        self.set_item('chromium_options', 'arguments', "['--no-default-browser-check', '--disable-suggestions-ui', "
                                                       "'--no-first-run', '--disable-infobars', "
                                                       "'--disable-popup-blocking', '--hide-crash-restore-bubble', "
                                                       "'--disable-features=PrivacySandboxSettings4']")
        self.set_item('chromium_options', 'extensions', '[]')
        self.set_item('chromium_options', 'prefs', "{'profile.default_content_settings.popups': 0, "
                                                   "'profile.default_content_setting_values': "
                                                   "{'notifications': 2}}")
        self.set_item('chromium_options', 'flags', '{}')
        self.set_item('chromium_options', 'load_mode', 'normal')
        self.set_item('chromium_options', 'user', 'Default')
        self.set_item('chromium_options', 'auto_port', 'False')
        self.set_item('chromium_options', 'system_user_path', 'False')
        self.set_item('chromium_options', 'existing_only', 'False')
        self.set_item('chromium_options', 'new_env', 'False')
        self.set_item('session_options', 'headers', "{'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X "
                                                    "10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10."
                                                    "1.2 Safari/603.3.8', 'accept': 'text/html,application/xhtml"
                                                    "+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': "
                                                    "'keep-alive', 'accept-charset': 'GB2312,utf-8;q=0.7,*;q=0.7'}")
        self.set_item('timeouts', 'base', '10')
        self.set_item('timeouts', 'page_load', '30')
        self.set_item('timeouts', 'script', '30')
        self.set_item('proxies', 'http', '')
        self.set_item('proxies', 'https', '')
        self.set_item('others', 'retry_times', '3')
        self.set_item('others', 'retry_interval', '2')

    def __getattr__(self, item):
        """Return a whole section as a dict, e.g. ``om.timeouts``.
        :param item: section name
        :return: dict built from the section content
        """
        # Private/dunder names are never sections. Answering them with
        # AttributeError keeps hasattr() and protocol lookups working and avoids
        # unbounded recursion when '_conf' itself has not been set yet.
        if item.startswith('_'):
            raise AttributeError(item)
        return self.get_option(item)

    def get_value(self, section, item):
        """Get one configuration value.
        :param section: section name
        :param item: option name
        :return: the evaluated value, the raw text when it is not an expression,
                 or None when the section or option does not exist
        """
        try:
            raw = self._conf.get(section, item)
        except (NoSectionError, NoOptionError):
            # 'except A and B' would only catch B; a tuple catches both.
            return None

        # NOTE(security): eval() executes whatever expression the ini file
        # contains; only load configuration files from trusted sources.
        try:
            return eval(raw)
        except (SyntaxError, NameError):
            return raw

    def get_option(self, section):
        """Return the content of a section as a dict.
        :param section: section name
        :return: dict of option name -> evaluated value (raw text when evaluation fails)
        """
        option = dict()
        for name, _ in self._conf.items(section):
            raw = self._conf.get(section, name)
            # NOTE(security): see get_value() - values are evaluated as Python.
            try:
                option[name] = eval(raw)
            except Exception:
                option[name] = raw

        return option

    def set_item(self, section, item, value):
        """Set a configuration value; the value is stored as text.
        :param section: section name
        :param item: option name
        :param value: option value
        :return: current object
        """
        self._conf.set(section, item, str(value))
        self.__setattr__(f'_{section}', None)
        return self

    def remove_item(self, section, item):
        """Delete a configuration value.
        :param section: section name
        :param item: option name
        :return: current object
        """
        self._conf.remove_option(section, item)
        return self

    def save(self, path=None):
        """Write the configuration to an ini file.
        :param path: ini file path; None saves to the file that was read,
                     'default' saves to the default ini file
        :return: path of the saved file
        :raises ValueError: if path is None and no file path is known
        """
        default_path = (Path(__file__).parent / 'configs.ini').absolute()
        if path == 'default':
            path = default_path
        elif path is None:
            if self.ini_path is None:
                raise ValueError('ini_path未设置。')
            path = self.ini_path.absolute()
        else:
            path = Path(path).absolute()

        path = path / 'config.ini' if path.is_dir() else path
        path.parent.mkdir(exist_ok=True, parents=True)

        path = str(path)
        # Close the handle deterministically so the content is flushed to disk.
        with open(path, 'w', encoding='utf-8') as f:
            self._conf.write(f)

        print(f'配置已保存到文件:{path}')
        if path == str(default_path):
            print('以后程序可自动从文件加载配置。')

        self.file_exists = True
        return path

    def save_to_default(self):
        """Save the current configuration to the default ini file."""
        return self.save('default')

    def show(self):
        """Print every section and its settings."""
        for i in self._conf.sections():
            print(f'[{i}]')
            pprint(self.get_option(i))
            print()
+ + def remove_item(self, section: str, item: str) -> None: ... + + def save(self, path: str = None) -> str: ... + + def save_to_default(self) -> str: ... + + def show(self) -> None: ... diff --git a/src/flaresolverr/DrissionPage/_configs/session_options.py b/src/flaresolverr/DrissionPage/_configs/session_options.py new file mode 100644 index 0000000000..2b080f85e8 --- /dev/null +++ b/src/flaresolverr/DrissionPage/_configs/session_options.py @@ -0,0 +1,485 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. +""" +from copy import copy +from pathlib import Path + +from requests import Session +from requests.structures import CaseInsensitiveDict + +from .options_manage import OptionsManager +from .._functions.cookies import cookies_to_tuple, set_session_cookies +from .._functions.web import format_headers + + +class SessionOptions(object): + """requests的Session对象配置类""" + + def __init__(self, read_file=True, ini_path=None): + """ + :param read_file: 是否从文件读取配置 + :param ini_path: ini文件路径 + """ + self.ini_path = None + self._download_path = '.' 
+ self._timeout = 10 + self._del_set = set() # 记录要从ini文件删除的参数 + + if read_file is False: + ini_path = False + self.ini_path = None + elif ini_path: + ini_path = Path(ini_path).absolute() + if not ini_path.exists(): + raise ValueError(f'文件不存在:{ini_path}') + self.ini_path = str(ini_path) + else: + self.ini_path = str(Path(__file__).parent / 'configs.ini') + om = OptionsManager(ini_path) + + self._headers = None + self._cookies = None + self._auth = None + self._proxies = None + self._hooks = None + self._params = None + self._verify = None + self._cert = None + self._adapters = None + self._stream = None + self._trust_env = None + self._max_redirects = None + + options = om.session_options + if options.get('headers', None) is not None: + self.set_headers(options['headers']) + + if options.get('cookies', None) is not None: + self.set_cookies(options['cookies']) + + if options.get('auth', None) is not None: + self._auth = options['auth'] + + if options.get('params', None) is not None: + self._params = options['params'] + + if options.get('verify', None) is not None: + self._verify = options['verify'] + + if options.get('cert', None) is not None: + self._cert = options['cert'] + + if options.get('stream', None) is not None: + self._stream = options['stream'] + + if options.get('trust_env', None) is not None: + self._trust_env = options['trust_env'] + + if options.get('max_redirects', None) is not None: + self._max_redirects = options['max_redirects'] + + self.set_proxies(om.proxies.get('http', None), om.proxies.get('https', None)) + self._timeout = om.timeouts.get('base', 10) + self._download_path = om.paths.get('download_path', '.') or '.' 
+ + others = om.others + self._retry_times = others.get('retry_times', 3) + self._retry_interval = others.get('retry_interval', 2) + + # ===========须独立处理的项开始============ + @property + def download_path(self): + """返回默认下载路径属性信息""" + return self._download_path + + def set_download_path(self, path): + """设置默认下载路径 + :param path: 下载路径 + :return: 返回当前对象 + """ + self._download_path = '.' if path is None else str(path) + return self + + @property + def timeout(self): + """返回timeout属性信息""" + return self._timeout + + def set_timeout(self, second): + """设置超时信息 + :param second: 秒数 + :return: 返回当前对象 + """ + self._timeout = second + return self + + @property + def proxies(self): + """返回proxies设置信息""" + if self._proxies is None: + self._proxies = {} + return self._proxies + + def set_proxies(self, http=None, https=None): + """设置proxies参数 + :param http: http代理地址 + :param https: https代理地址 + :return: 返回当前对象 + """ + self._sets('proxies', {'http': http, 'https': https}) + return self + + @property + def retry_times(self): + """返回连接失败时的重试次数""" + return self._retry_times + + @property + def retry_interval(self): + """返回连接失败时的重试间隔(秒)""" + return self._retry_interval + + def set_retry(self, times=None, interval=None): + """设置连接失败时的重试操作 + :param times: 重试次数 + :param interval: 重试间隔 + :return: 当前对象 + """ + if times is not None: + self._retry_times = times + if interval is not None: + self._retry_interval = interval + return self + + # ===========须独立处理的项结束============ + + @property + def headers(self): + """返回headers设置信息""" + if self._headers is None: + self._headers = {} + return self._headers + + def set_headers(self, headers): + """设置headers参数 + :param headers: 参数值,传入None可在ini文件标记删除 + :return: 返回当前对象 + """ + if headers is None: + self._headers = None + self._del_set.add('headers') + else: + headers = format_headers(headers) + self._headers = {key.lower(): headers[key] for key in headers} + return self + + def set_a_header(self, name, value): + """设置headers中一个项 + :param name: 设置名称 + :param 
value: 设置值
+        :return: 返回当前对象
+        """
+        if self._headers is None:
+            self._headers = {}
+
+        self._headers[name.lower()] = value
+        return self
+
+    def remove_a_header(self, name):
+        """Remove one entry from the headers
+        :param name: name of the header to remove (looked up lower-cased)
+        :return: this object
+        """
+        if self._headers is None:
+            return self
+
+        self._headers.pop(name.lower(), None)
+
+        return self
+
+    def clear_headers(self):
+        """Clear the configured headers and mark 'headers' for removal when saving to an ini file"""
+        self._headers = None
+        self._del_set.add('headers')
+
+    @property
+    def cookies(self):
+        """Return the cookies as a list (created empty on first access)"""
+        if self._cookies is None:
+            self._cookies = []
+        return self._cookies
+
+    def set_cookies(self, cookies):
+        """Set one or more cookies
+        :param cookies: cookies as Cookie, CookieJar, list, tuple, str or dict; pass None to mark them for removal in the ini file
+        :return: this object
+        """
+        cookies = cookies if cookies is None else list(cookies_to_tuple(cookies))
+        self._sets('cookies', cookies)
+        return self
+
+    @property
+    def auth(self):
+        """Return the authentication setting"""
+        return self._auth
+
+    def set_auth(self, auth):
+        """Set the authentication tuple or object
+        :param auth: authentication tuple or object; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('auth', auth)
+        return self
+
+    @property
+    def hooks(self):
+        """Return the hook callbacks (created as an empty dict on first access)"""
+        if self._hooks is None:
+            self._hooks = {}
+        return self._hooks
+
+    def set_hooks(self, hooks):
+        """Set the hook callbacks
+        :param hooks: hook callbacks
+        :return: this object
+        """
+        self._hooks = hooks  # assigned directly, so None is NOT recorded in _del_set like the other setters
+        return self
+
+    @property
+    def params(self):
+        """Return the query parameter settings (created as an empty dict on first access)"""
+        if self._params is None:
+            self._params = {}
+        return self._params
+
+    def set_params(self, params):
+        """Set the query parameter dict
+        :param params: query parameter dict; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('params', params)
+        return self
+
+    @property
+    def verify(self):
+        """Return whether SSL certificates are verified"""
+        return self._verify
+
+    def set_verify(self, on_off):
+        """Set whether SSL certificates are verified
+        :param on_off: whether to verify SSL certificates; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('verify', on_off)
+        return self
+
+    @property
+    def cert(self):
+        """Return the SSL client certificate setting"""
+        return self._cert
+
+    def set_cert(self, cert):
+        """Set the SSL client certificate: path to a .pem file, or a ('cert', 'key') tuple
+        :param cert: certificate path or tuple; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('cert', cert)
+        return self
+
+    @property
+    def adapters(self):
+        """Return the configured adapters as a list of (url, adapter) pairs (created empty on first access)"""
+        if self._adapters is None:
+            self._adapters = []
+        return self._adapters
+
+    def add_adapter(self, url, adapter):
+        """Add an adapter
+        :param url: url prefix the adapter is mounted on
+        :param adapter: adapter object
+        :return: this object
+        """
+        self.adapters.append((url, adapter))  # go through the property: _adapters may still be None
+        return self
+
+    @property
+    def stream(self):
+        """Return whether response content is streamed"""
+        return self._stream
+
+    def set_stream(self, on_off):
+        """Set whether response content is streamed
+        :param on_off: whether to stream response content; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('stream', on_off)
+        return self
+
+    @property
+    def trust_env(self):
+        """Return the trust_env setting"""
+        return self._trust_env
+
+    def set_trust_env(self, on_off):
+        """Set the trust_env flag
+        :param on_off: whether to trust the environment; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('trust_env', on_off)
+        return self
+
+    @property
+    def max_redirects(self):
+        """Return the maximum number of redirects"""
+        return self._max_redirects
+
+    def set_max_redirects(self, times):
+        """Set the maximum number of redirects
+        :param times: maximum number of redirects; None marks the setting for removal
+        :return: this object
+        """
+        self._sets('max_redirects', times)
+        return self
+
+    def _sets(self, arg, val):
+        """Assign the private attribute `_<arg>`, or mark it for removal when val is None
+        :param arg: attribute name without the leading underscore
+        :param val: value to store; None clears the attribute and records arg in _del_set
+        :return: None
+        """
+        if val is None:
+            self.__setattr__(f'_{arg}', None)
+            self._del_set.add(arg)
+        else:
+            self.__setattr__(f'_{arg}', val)
+            if arg in self._del_set:
+                self._del_set.remove(arg)
+
+    def save(self, path=None):
+        """Save the settings to an ini file
+        :param path: path of the ini file; 'default' saves to the default ini file, None uses ini_path or the default file
+        :return: absolute path of the saved file (str)
+        """
+        if path == 'default':
+            path = (Path(__file__).parent / 'configs.ini').absolute()
+
+        elif path is None:
+            if self.ini_path:
+                path = Path(self.ini_path).absolute()
+            else:
+                path = (Path(__file__).parent / 'configs.ini').absolute()
+
+        else:
+            path = Path(path).absolute()
+
+        path = path / 'config.ini' if path.is_dir() else path  # NOTE(review): 'config.ini' vs default 'configs.ini' - confirm intended
+
+        if path.exists():
+            om = OptionsManager(path)
+        else:
+            om = OptionsManager(self.ini_path or (Path(__file__).parent / 'configs.ini'))
+
+        options = session_options_to_dict(self)
+
+        for i in options:
+            if i not in ('download_path', 'timeout', 'proxies'):  # these three live in their own ini sections below
+                om.set_item('session_options', i, options[i])
+
+        om.set_item('paths', 'download_path', self.download_path or '')
+        om.set_item('timeouts', 'base', self.timeout)
+        om.set_item('proxies', 'http', self.proxies.get('http', ''))
+        om.set_item('proxies', 'https', self.proxies.get('https', ''))
+        om.set_item('others', 'retry_times', self.retry_times)
+        om.set_item('others', 'retry_interval', self.retry_interval)
+
+        for i in self._del_set:  # apply the removals recorded by _sets() / clear_headers()
+            if i == 'download_path':
+                om.set_item('paths', 'download_path', '')
+            elif i == 'proxies':
+                om.set_item('proxies', 'http', '')
+                om.set_item('proxies', 'https', '')
+            else:
+                om.remove_item('session_options', i)
+
+        path = str(path)
+        om.save(path)
+
+        return path
+
+    def save_to_default(self):
+        """Save the current settings to the default ini file"""
+        return self.save('default')
+
+    def as_dict(self):
+        """Return this object as a dict"""
+        return session_options_to_dict(self)
+
+    def make_session(self):
+        """Build a Session from the stored settings; headers are returned separately as (session, headers)"""
+        s = Session()
+        h = CaseInsensitiveDict(self.headers) if self.headers else CaseInsensitiveDict()
+
+        if self.cookies:
+            set_session_cookies(s, self.cookies)
+        if self.adapters:
+            for url, adapter in self.adapters:
+                s.mount(url, adapter)
+
+        for i in ['auth', 'proxies', 'hooks', 'params', 'verify', 'cert', 'stream', 'trust_env', 'max_redirects']:
+            attr = self.__getattribute__(i)
+            if attr or (attr is not None and isinstance(attr, (bool, int))):  # explicit False / 0 must be applied too
+                s.__setattr__(i, attr)
+
+        return s, h
+
+    def from_session(self, session, headers=None):
+        """Read the settings from a Session object
+        :param session: Session object
+        :param headers: extra headers merged over the session's headers (they win on conflict)
+        :return: this object
+        """
+        self._headers = CaseInsensitiveDict({**session.headers, **headers}) if headers else session.headers
+        self._cookies = session.cookies
+        self._auth = session.auth
+        self._proxies = session.proxies
+        self._hooks = session.hooks
+        self._params = session.params
+        self._verify = session.verify
+        self._cert = session.cert
+        self._stream = session.stream
+        self._trust_env = session.trust_env
+        self._max_redirects = session.max_redirects
+        if session.adapters:
+            self._adapters = list(session.adapters.items())
+        return self
+
+    def __repr__(self):
+        return f'<SessionOptions at {id(self)}>'
+
+
+def session_options_to_dict(options):
+    """Convert a session options object to a dict
+    :param options: SessionOptions object or dict; False/None yields the dict of a fresh SessionOptions(read_file=False)
+    :return: dict of the settings that are not None (a dict argument is returned unchanged)
+    """
+    if options in (False, None):
+        return SessionOptions(read_file=False).as_dict()
+
+    if isinstance(options, dict):
+        return options
+
+    re_dict = dict()
+    attrs = ['headers', 'cookies', 'proxies', 'params', 'verify', 'stream', 'trust_env', 'cert',
+             'max_redirects', 'timeout', 'download_path']
+
+    for attr in attrs:
+        val = options.__getattribute__(f'_{attr}')  # read the raw private value, bypassing lazily-initialising properties
+        if val is not None:
+            re_dict[attr] = val
+
+    return re_dict
diff --git a/src/flaresolverr/DrissionPage/_configs/session_options.pyi b/src/flaresolverr/DrissionPage/_configs/session_options.pyi
new file mode 100644
index 0000000000..ae428b1646
--- /dev/null
+++ b/src/flaresolverr/DrissionPage/_configs/session_options.pyi
@@ -0,0 +1,136 @@
+# -*- coding:utf-8 -*-
+"""
+@Author   : g1879
+@Contact  : g1879@qq.com
+@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved.
+@License  : BSD 3-Clause.
+"""
+from http.cookiejar import CookieJar, Cookie
+from pathlib import Path
+from typing import Any, Union, Tuple, Optional
+
+from requests import Session
+from requests.adapters import HTTPAdapter
+from requests.auth import HTTPBasicAuth
+from requests.structures import CaseInsensitiveDict
+
+
+class SessionOptions(object):
+    def __init__(self, read_file: Optional[bool] = True, ini_path: Union[str, Path] = None):
+        self.ini_path: str = ...
+        self._download_path: str = ...
+        self._headers: dict = ...
+        self._cookies: list = ...
+        self._auth: tuple = ...
+        self._proxies: dict = ...
+        self._hooks: dict = ...
+        self._params: dict = ...
+        self._verify: bool = ...
+        self._cert: Union[str, tuple] = ...
+        self._adapters: list = ...
+        self._stream: bool = ...
+        self._trust_env: bool = ...
+        self._max_redirects: int = ...
+        self._timeout: float = ...
+        self._del_set: set = ...
+        self._retry_times: int = ...
+        self._retry_interval: float = ...
+
+    @property
+    def download_path(self) -> str: ...
+
+    def set_download_path(self, path: Union[str, Path]) -> SessionOptions: ...
+
+    @property
+    def timeout(self) -> float: ...
+
+    def set_timeout(self, second: float) -> SessionOptions: ...
+
+    @property
+    def headers(self) -> dict: ...
+
+    def set_headers(self, headers: Union[dict, str, None]) -> SessionOptions: ...
+
+    def set_a_header(self, name: str, value: str) -> SessionOptions: ...
+
+    def remove_a_header(self, name: str) -> SessionOptions: ...
+
+    def clear_headers(self) -> None: ...
+
+    @property
+    def cookies(self) -> list: ...
+
+    def set_cookies(self, cookies: Union[Cookie, CookieJar, list, tuple, str, dict, None]) -> SessionOptions: ...
+
+    @property
+    def auth(self) -> Union[Tuple[str, str], HTTPBasicAuth]: ...
+
+    def set_auth(self, auth: Union[Tuple[str, str], HTTPBasicAuth, None]) -> SessionOptions: ...
+
+    @property
+    def proxies(self) -> dict: ...
+
+    def set_proxies(self, http: Union[str, None], https: Union[str, None] = None) -> SessionOptions: ...
+
+    @property
+    def retry_times(self) -> int: ...
+
+    @property
+    def retry_interval(self) -> float: ...
+
+    def set_retry(self, times: int = None, interval: float = None) -> SessionOptions: ...
+
+    @property
+    def hooks(self) -> dict: ...
+
+    def set_hooks(self, hooks: Union[dict, None]) -> SessionOptions: ...
+
+    @property
+    def params(self) -> dict: ...
+
+    def set_params(self, params: Union[dict, None]) -> SessionOptions: ...
+
+    @property
+    def verify(self) -> bool: ...
+
+    def set_verify(self, on_off: Union[bool, None]) -> SessionOptions: ...
+
+    @property
+    def cert(self) -> Union[str, tuple]: ...
+
+    def set_cert(self, cert: Union[str, Tuple[str, str], None]) -> SessionOptions: ...
+
+    @property
+    def adapters(self) -> list: ...
+
+    def add_adapter(self, url: str, adapter: HTTPAdapter) -> SessionOptions: ...
+
+    @property
+    def stream(self) -> bool: ...
+
+    def set_stream(self, on_off: Union[bool, None]) -> SessionOptions: ...
+
+    @property
+    def trust_env(self) -> bool: ...
+
+    def set_trust_env(self, on_off: Union[bool, None]) -> SessionOptions: ...
+
+    @property
+    def max_redirects(self) -> int: ...
+
+    def set_max_redirects(self, times: Union[int, None]) -> SessionOptions: ...
+
+    def _sets(self, arg: str, val: Any) -> None: ...
+
+    def save(self, path: str = None) -> str: ...
+
+    def save_to_default(self) -> str: ...
+
+    def as_dict(self) -> dict: ...
+
+    def make_session(self) -> Tuple[Session, Optional[CaseInsensitiveDict]]: ...
+
+    def from_session(self, session: Session, headers: CaseInsensitiveDict = None) -> SessionOptions: ...
+
+
+def session_options_to_dict(options: Union[dict, SessionOptions, None]) -> Union[dict, None]: ...
diff --git a/src/flaresolverr/DrissionPage/_elements/chromium_element.py b/src/flaresolverr/DrissionPage/_elements/chromium_element.py
new file mode 100644
index 0000000000..cbf1b1c926
--- /dev/null
+++ b/src/flaresolverr/DrissionPage/_elements/chromium_element.py
@@ -0,0 +1,1803 @@
+# -*- coding:utf-8 -*-
+"""
+@Author   : g1879
+@Contact  : g1879@qq.com
+@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved.
+@License  : BSD 3-Clause.
+"""
+from json import loads
+from os.path import basename
+from pathlib import Path
+from re import search
+from time import perf_counter, sleep
+
+from ...DataRecorder.tools import get_usable_path, make_valid_name
+
+from .none_element import NoneElement
+from .session_element import make_session_ele
+from .._base.base import DrissionElement, BaseElement
+from .._functions.elements import ChromiumElementsList, SessionElementsList
+from .._functions.keys import input_text_or_keys
+from .._functions.locator import get_loc, locator_to_tuple
+from .._functions.web import make_absolute_link, get_ele_txt, format_html, is_js_func, get_blob
+from .._units.clicker import Clicker
+from .._units.rect import ElementRect
+from .._units.scroller import ElementScroller
+from .._units.selector import SelectElement
+from .._units.setter import ChromiumElementSetter
+from .._units.states import ElementStates, ShadowRootStates
+from .._units.waiter import ElementWaiter
+from ..errors import ContextLostError, ElementLostError, JavaScriptError, CDPError, NoResourceError, AlertExistsError, \
+    NoRectError
+
+__FRAME_ELEMENT__ = ('iframe', 'frame')
+
+
+class ChromiumElement(DrissionElement):
+    """Object that controls an element in the browser"""
+
+    def __init__(self, owner, node_id=None, obj_id=None, backend_id=None):
+        """At least one of node_id, obj_id and backend_id must be given; the missing ids are resolved over CDP
+        :param owner: page object the element belongs to
+        :param node_id: node id in CDP
+        :param obj_id: object id in js
+        :param backend_id: backend id
+        """
+        super().__init__(owner)
+        self.tab = self.owner._tab
+        self._select = None
+        self._scroll = None
+        self._rect = None
+        self._set = None
+        self._states = None
+        self._pseudo = None
+        self._clicker = None
+        self._tag = None
+        self._wait = None
+        self._type = 'ChromiumElement'
+        self._doc_id = None
+
+        if node_id and obj_id and backend_id:
+            self._node_id = node_id
+            self._obj_id = obj_id
+            self._backend_id = backend_id
+        elif node_id:
+            self._node_id = node_id
+            self._obj_id = self._get_obj_id(node_id)
+            self._backend_id = self._get_backend_id(self._node_id)
+        elif obj_id:
+            self._node_id = self._get_node_id(obj_id)
+            self._obj_id = obj_id
+            self._backend_id = self._get_backend_id(self._node_id)
+        elif backend_id:
+            self._obj_id = self._get_obj_id(backend_id=backend_id)
+            self._node_id = self._get_node_id(obj_id=self._obj_id)
+            self._backend_id = backend_id
+        else:
+            raise ElementLostError
+
+    def __repr__(self):
+        attrs = [f"{k}='{v}'" for k, v in self.attrs.items()]
+        return f'<ChromiumElement {self.tag} {" ".join(attrs)}>'
+
+    def __call__(self, locator, index=1, timeout=None):
+        """Find an element inside this one (shortcut for ele(); index selects which match)
+        :param locator: locator of the element, a loc tuple or a query string
+        :param timeout: timeout in seconds
+        :return: ChromiumElement object, or attribute / text
+        """
+        return self.ele(locator, index=index, timeout=timeout)
+
+    def __eq__(self, other):  # two elements are equal when they share the same backend node id
+        return self._backend_id == getattr(other, '_backend_id', None)
+
+    @property
+    def tag(self):
+        """Return the element's tag name in lower case (cached after the first lookup)"""
+        if self._tag is None:
+            self._tag = self.owner._run_cdp('DOM.describeNode',
+                                            backendNodeId=self._backend_id)['node']['localName'].lower()
+        return self._tag
+
+    @property
+    def html(self):
+        """Return the element's outerHTML text"""
+        return self.owner._run_cdp('DOM.getOuterHTML', backendNodeId=self._backend_id)['outerHTML']
+
+    @property
+    def inner_html(self):
+        """Return the element's innerHTML text"""
+        return self._run_js('return this.innerHTML;')
+
+    @property
+    def attrs(self):
+        """Return all attributes of the element as a dict"""
+        try:
+            attrs = self.owner._run_cdp('DOM.getAttributes', nodeId=self._node_id)['attributes']
+            return {attrs[i]: attrs[i + 1] for i in range(0, len(attrs), 2)}  # CDP returns a flat [name, value, ...] list
+        except ElementLostError:
+            self._refresh_id()  # node id went stale: re-resolve it from the backend id and retry once
+            attrs = self.owner._run_cdp('DOM.getAttributes', nodeId=self._node_id)['attributes']
+            return {attrs[i]: attrs[i + 1] for i in range(0, len(attrs), 2)}
+        except CDPError:  # the document root element cannot call this method
+            return {}
+
+    @property
+    def text(self):
+        """Return all text inside the element, formatted"""
+        return get_ele_txt(make_session_ele(self.html))
+
+    @property
+    def raw_text(self):
+        """Return the element's text without formatting (its innerText property)"""
+        return self.property('innerText')
+
+    # ----------------- properties specific to d mode -------------------
+    @property
+    def set(self):
+        """Return the object used to set element attributes"""
+        if self._set is None:
+            self._set = ChromiumElementSetter(self)
+        return self._set
+
+    @property
+    def states(self):
+        """Return the object used to query element states"""
+        if self._states is None:
+            self._states = ElementStates(self)
+        return self._states
+
+    @property
+    def pseudo(self):
+        """Return the object used to read pseudo-element content"""
+        if self._pseudo is None:
+            self._pseudo = Pseudo(self)
+        return self._pseudo
+
+    @property
+    def rect(self):
+        """Return the object used to query the element's position"""
+        if self._rect is None:
+            self._rect = ElementRect(self)
+        return self._rect
+
+    @property
+    def sr(self):
+        """Return the shadow root of this element, polling until the page timeout; None if there is none"""
+        end_time = perf_counter() + self.owner.timeout
+        while perf_counter() < end_time:
+            info = self.owner._run_cdp('DOM.describeNode', backendNodeId=self._backend_id)['node']
+            if info.get('shadowRoots', None):
+                return ShadowRoot(self, backend_id=info['shadowRoots'][0]['backendNodeId'])
+        return None
+
+    @property
+    def shadow_root(self):
+        """Return the shadow root of this element (alias of sr)"""
+        return self.sr
+
+    @property
+    def scroll(self):
+        """Return the object used to scroll the element"""
+        if self._scroll is None:
+            self._scroll = ElementScroller(self)
+        return self._scroll
+
+    @property
+    def click(self):
+        """Return the object used for clicking"""
+        if self._clicker is None:
+            self._clicker = Clicker(self)
+        return self._clicker
+
+    @property
+    def wait(self):
+        """Return the object used for waiting"""
+        if self._wait is None:
+            self._wait = ElementWaiter(self)
+        return self._wait
+
+    @property
+    def select(self):
+        """Return the SelectElement helper for drop-down lists; False if this element is not a <select>"""
+        if self._select is None:
+            if self.tag != 'select':
+                self._select = False
+            else:
+                self._select = SelectElement(self)
+
+        return self._select
+
+    @property
+    def value(self):  # the element's 'value' property
+        return self.property('value')
+
+    def check(self, uncheck=False, by_js=False):
+        """Check or uncheck this element; nothing is done when it is already in the requested state
+        :param uncheck: whether to uncheck instead of check
+        :param by_js: whether to do it with js
+        :return: None
+        """
+        is_checked = self.states.is_checked
+        if by_js:
+            js = None
+            if is_checked and uncheck:
+                js = 'this.checked=false'
+            elif not is_checked and not uncheck:
+                js = 'this.checked=true'
+            if js:
+                self._run_js(js)
+                self._run_js('this.dispatchEvent(new Event("change", {bubbles: true}));')
+
+        else:
+            if (is_checked and uncheck) or (not is_checked and not uncheck):
+                self.click()
+
+    def parent(self, level_or_loc=1, index=1, timeout=0):
+        """Return an ancestor element, selected by level or by locator
+        :param level_or_loc: ancestor level starting from 1, or a locator
+        :param index: when level_or_loc is a locator, which match to take, starting from 1
+        :param timeout: search timeout in seconds
+        :return: the ancestor element object
+        """
+        return super().parent(level_or_loc, index, timeout=timeout)
+
+    def child(self, locator='', index=1, timeout=None, ele_only=True):
+        """Return one direct child matching the filter; index selects which match
+        :param locator: query used for filtering
+        :param index: which match to return, starting from 1
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: direct child element or node text
+        """
+        return super().child(locator, index, timeout, ele_only=ele_only)
+
+    def prev(self, locator='', index=1, timeout=None, ele_only=True):
+        """Return one preceding sibling matching the filter; index selects which match
+        :param locator: query used for filtering
+        :param index: which preceding match to return, starting from 1
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: sibling element or node text
+        """
+        return super().prev(locator, index, timeout, ele_only=ele_only)
+
+    def next(self, locator='', index=1, timeout=None, ele_only=True):
+        """Return one following sibling matching the filter; index selects which match
+        :param locator: query used for filtering
+        :param index: which match to return, starting from 1
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: sibling element or node text
+        """
+        return super().next(locator, index, timeout, ele_only=ele_only)
+
+    def before(self, locator='', index=1, timeout=None, ele_only=True):
+        """Return one element located before this one in the document and matching the filter
+        The search is not limited to siblings; it covers the whole DOM document
+        :param locator: query used for filtering
+        :param index: which preceding match to return, starting from 1
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: an element or node before this element
+        """
+        return super().before(locator, index, timeout, ele_only=ele_only)
+
+    def after(self, locator='', index=1, timeout=None, ele_only=True):
+        """Return one element located after this one in the document and matching the filter
+        The search is not limited to siblings; it covers the whole DOM document
+        :param locator: query used for filtering
+        :param index: which match to return, starting from 1
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: an element or node after this element
+        """
+        return super().after(locator, index, timeout, ele_only=ele_only)
+
+    def children(self, locator='', timeout=None, ele_only=True):
+        """Return the list of direct children (elements or nodes) matching the filter
+        :param locator: query used for filtering
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: list of direct child elements or node texts
+        """
+        return ChromiumElementsList(self.owner, super().children(locator, timeout, ele_only=ele_only))
+
+    def prevs(self, locator='', timeout=None, ele_only=True):
+        """Return the list of preceding siblings (elements or nodes) matching the filter
+        :param locator: query used for filtering
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: list of sibling elements or node texts
+        """
+        return ChromiumElementsList(self.owner, super().prevs(locator, timeout, ele_only=ele_only))
+
+    def nexts(self, locator='', timeout=None, ele_only=True):
+        """Return the list of following siblings (elements or nodes) matching the filter
+        :param locator: query used for filtering
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: list of sibling elements or node texts
+        """
+        return ChromiumElementsList(self.owner, super().nexts(locator, timeout, ele_only=ele_only))
+
+    def befores(self, locator='', timeout=None, ele_only=True):
+        """Return the list of elements or nodes located before this one in the document and matching the filter
+        The search is not limited to siblings; it covers the whole DOM document
+        :param locator: query used for filtering
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: list of elements or nodes before this element
+        """
+        return ChromiumElementsList(self.owner, super().befores(locator, timeout, ele_only=ele_only))
+
+    def afters(self, locator='', timeout=None, ele_only=True):
+        """Return the list of elements or nodes located after this one in the document and matching the filter
+        The search is not limited to siblings; it covers the whole DOM document
+        :param locator: query used for filtering
+        :param timeout: search timeout in seconds
+        :param ele_only: whether to return elements only; False also includes text and comment nodes
+        :return: list of elements or nodes after this element
+        """
+        return ChromiumElementsList(self.owner, super().afters(locator, timeout, ele_only=ele_only))
+
+    def over(self, timeout=None):
+        """Get the topmost element covering this element
+        :param timeout: seconds to wait for a covering element; None uses the owner page's timeout
+        :return: element object, or NoneElement when nothing covers this element
+        """
+        timeout = self.owner.timeout if timeout is None else timeout
+        bid = self.wait.covered(timeout=timeout)
+        if bid:
+            return ChromiumElement(owner=self.owner, backend_id=bid)
+        else:
+            return NoneElement(page=self.owner, method='over()', args={'timeout': timeout})
+
+    def offset(self, locator=None, x=None, y=None, timeout=None):
+        """Get the element at an offset from this element's top-left corner; if x and y are both None, its midpoint is used
+        :param locator: locator, str only, xpath and css forms are not supported
+        :param x: horizontal offset, positive to the right
+        :param y: vertical offset, positive downwards
+        :param timeout: timeout in seconds, None uses the owner page's setting
+        :return: element object, or NoneElement when none is found in time
+        """
+        if locator and not (isinstance(locator, str) and not locator.startswith(
+                ('x:', 'xpath:', 'x=', 'xpath=', 'c:', 'css:', 'c=', 'css='))):
+            raise ValueError('locator参数只能是str格式且不支持xpath和css形式。')
+
+        if x is None and y is None:
+            x, y = self.rect.midpoint
+            x = int(x)
+            y = int(y)
+        else:
+            nx, ny = self.rect.location
+            nx += x if x else 0
+            ny += y if y else 0
+            x = int(nx)
+            y = int(ny)
+        loc_data = locator_to_tuple(locator) if locator else None
+        timeout = timeout if timeout is not None else self.owner.timeout
+        end_time = perf_counter() + timeout
+        try:  # first probe happens immediately, so a timeout of 0 still checks once
+            ele = ChromiumElement(owner=self.owner,
+                                  backend_id=self.owner._run_cdp('DOM.getNodeForLocation', x=x, y=y,
+                                                                 includeUserAgentShadowDOM=True,
+                                                                 ignorePointerEventsNone=False)['backendNodeId'])
+        except CDPError:
+            ele = False
+        if ele and (loc_data is None or _check_ele(ele, loc_data)):
+            return ele
+
+        while perf_counter() < end_time:
+            try:
+                ele = ChromiumElement(owner=self.owner,
+                                      backend_id=self.owner._run_cdp('DOM.getNodeForLocation', x=x, y=y,
+                                                                     includeUserAgentShadowDOM=True,
+                                                                     ignorePointerEventsNone=False)['backendNodeId'])
+            except CDPError:
+                ele = False
+
+            if ele and (loc_data is None or _check_ele(ele, loc_data)):
+                return ele
+            sleep(.1)
+
+        return NoneElement(page=self.owner, method='offset()',
+                           args={'locator': locator, 'offset_x': x, 'offset_y': y, 'timeout': timeout})
+
+    def east(self, loc_or_pixel=None, index=1):
+        """Get an element to the right of this element
+        :param loc_or_pixel: locator (str, xpath and css forms not supported) or int pixel distance
+        :param index: which match to return, starting from 1
+        :return: the element found
+        """
+        return self._get_relative_eles(mode='east', locator=loc_or_pixel, index=index)
+
+    def south(self, loc_or_pixel=None, index=1):
+        """Get an element below this element
+        :param loc_or_pixel: locator (str, xpath and css forms not supported) or int pixel distance
+        :param index: which match to return, starting from 1
+        :return: the element found
+        """
+        return self._get_relative_eles(mode='south', locator=loc_or_pixel, index=index)
+
+    def west(self, loc_or_pixel=None, index=1):
+        """Get an element to the left of this element
+        :param loc_or_pixel: locator (str, xpath and css forms not supported) or int pixel distance
+        :param index: which match to return, starting from 1
+        :return: the element found
+        """
+        return self._get_relative_eles(mode='west', locator=loc_or_pixel, index=index)
+
+    def north(self, loc_or_pixel=None, index=1):
+        """Get an element above this element
+        :param loc_or_pixel: locator (str, xpath and css forms not supported) or int pixel distance
+        :param index: which match to return, starting from 1
+        :return: the element found
+        """
+        return self._get_relative_eles(mode='north', locator=loc_or_pixel, index=index)
+
+    def _get_relative_eles(self, mode='north', locator=None, index=1):
+        """Get an element in the direction given by mode ('east', 'south', 'west', anything else means north)
+        :param locator: locator (str, xpath and css forms not supported) or int pixel distance
+        :param index: which match to return, starting from 1
+        :return: the element found, or NoneElement
+        """
+        if locator and not (isinstance(locator, str) and not locator.startswith(
+                ('x:', 'xpath:', 'x=', 'xpath=', 'c:', 'css:', 'c=', 'css=')) or isinstance(locator, int)):
+            raise ValueError('locator参数只能是str格式且不支持xpath和css形式。')
+        rect = self.states.has_rect
+        if not rect:
+            raise NoRectError
+
+        if mode == 'east':
+            cdp_data = {'x': int(rect[1][0]), 'y': int(self.rect.midpoint[1]),
+                        'includeUserAgentShadowDOM': True, 'ignorePointerEventsNone': False}
+            variable = 'x'
+            minus = False
+        elif mode == 'south':
+            cdp_data = {'x': int(self.rect.midpoint[0]), 'y': int(rect[2][1]),
+                        'includeUserAgentShadowDOM': True, 'ignorePointerEventsNone': False}
+            variable = 'y'
+            minus = False
+        elif mode == 'west':
+            cdp_data = {'x': int(rect[0][0]), 'y': int(self.rect.midpoint[1]),
+                        'includeUserAgentShadowDOM': True, 'ignorePointerEventsNone': False}
+            variable = 'x'
+            minus = True
+        else:  # north
+            cdp_data = {'x': int(self.rect.midpoint[0]), 'y': int(rect[0][1]),
+                        'includeUserAgentShadowDOM': True, 'ignorePointerEventsNone': False}
+            variable = 'y'
+            minus = True
+
+        if isinstance(locator, int):  # pixel distance: probe exactly one point
+            if minus:
+                cdp_data[variable] -= locator
+            else:
+                cdp_data[variable] += locator
+            try:
+                return ChromiumElement(owner=self.owner,
+                                       backend_id=self.owner._run_cdp('DOM.getNodeForLocation',
+                                                                      **cdp_data)['backendNodeId'])
+            except CDPError:
+                return NoneElement(page=self.owner, method=f'{mode}()', args={'locator': locator})
+
+        num = 0
+        value = -8 if minus else 8  # scan in 8-pixel steps away from the element
+        size = self.owner.rect.size
+        max_len = size[0] if mode in ('east', 'west') else size[1]  # presumably size is (width, height); x scans use size[0]
+        loc_data = locator_to_tuple(locator) if locator else None
+        curr_ele = None
+        while 0 < cdp_data[variable] < max_len:
+            cdp_data[variable] += value
+            try:
+                bid = self.owner._run_cdp('DOM.getNodeForLocation', **cdp_data)['backendNodeId']
+                if bid == curr_ele:  # still over the same node as the previous probe
+                    continue
+                else:
+                    curr_ele = bid
+                ele = ChromiumElement(self.owner, backend_id=bid)
+
+                if loc_data is None or _check_ele(ele, loc_data):
+                    num += 1
+                    if num == index:
+                        return ele
+            except Exception:  # best effort: a probe point that cannot be resolved is skipped
+                pass
+
+        return NoneElement(page=self.owner, method=f'{mode}()', args={'locator': locator})
+
+    def attr(self, attr):
+        """Return the value of one attribute
+        :param attr: attribute name
+        :return: attribute value text, None if the attribute does not exist
+        """
+        attrs = self.attrs
+        if attr == 'href':  # href is returned as an absolute url
+            link = attrs.get('href')
+            if not link or link.lower().startswith(('javascript:', 'mailto:')):
+                return link
+            else:
+                return make_absolute_link(link, self.property('baseURI'))
+
+        elif attr == 'src':
+            return make_absolute_link(attrs.get('src'), self.property('baseURI'))
+
+        elif attr == 'text':
+            return self.text
+
+        elif attr == 'innerText':
+            return self.raw_text
+
+        elif attr in ('html', 'outerHTML'):
+            return self.html
+
+        elif attr == 'innerHTML':
+            return self.inner_html
+
+        else:
+            return attrs.get(attr, None)
+
+    def remove_attr(self, name):
+        """Remove one attribute from the element
+        :param name: attribute name
+        :return: None
+        """
+        self._run_js(f'this.removeAttribute("{name}");')
+
+    def property(self, name):
+        """Get the value of one js property
+        :param name: property name
+        :return: property value (str values are passed through format_html); None if reading it fails
+        """
+        try:
+            value = self._run_js(f'return this.{name};')
+            return format_html(value) if isinstance(value, str) else value
+        except Exception:  # deliberate best effort, but do not swallow KeyboardInterrupt / SystemExit
+            return None
+
+    def run_js(self, script, *args, as_expr=False, timeout=None):
+        """Run javascript code on this element
+        :param script: js text, in which `this` refers to this element
+        :param args: arguments, available in the js text as arguments[0], arguments[1]...
+        :param as_expr: whether to run as an expression; args are ignored when True
+        :param timeout: js timeout in seconds, None uses the page's timeouts.script setting
+        :return: the result of the script
+        """
+        return self._run_js(script, *args, as_expr=as_expr, timeout=timeout)
+
+    def _run_js(self, script, *args, as_expr=False, timeout=None):
+        """Run javascript code on this element
+        :param script: js text, in which `this` refers to this element
+        :param args: arguments, available in the js text as arguments[0], arguments[1]...
+        :param as_expr: whether to run as an expression; args are ignored when True
+        :param timeout: js timeout in seconds, None uses the page's timeouts.script setting
+        :return: the result of the script
+        """
+        return run_js(self, script, as_expr, self.owner.timeouts.script if timeout is None else timeout, args)
+
+    def run_async_js(self, script, *args, as_expr=False):
+        """Run javascript code on this element asynchronously (timeout 0, result discarded)
+        :param script: js text, in which `this` refers to this element
+        :param args: arguments, available in the js text as arguments[0], arguments[1]...
+        :param as_expr: whether to run as an expression; args are ignored when True
+        :return: None
+        """
+        run_js(self, script, as_expr, 0, args)
+
+    def ele(self, locator, index=1, timeout=None):
+        """Return one descendant element, attribute or node text matching the locator
+        :param locator: locator of the element, a loc tuple or a query string
+        :param index: which match to return, starting from 1; negative counts from the end
+        :param timeout: search timeout in seconds, defaults to the owner page's timeout
+        :return: ChromiumElement object, or attribute / text
+        """
+        return self._ele(locator, timeout, index=index, method='ele()')
+
+    def eles(self, locator, timeout=None):
+        """Return all descendant elements, attributes or node texts matching the locator
+        :param locator: locator of the element, a loc tuple or a query string
+        :param timeout: search timeout in seconds, defaults to the owner page's timeout
+        :return: list of ChromiumElement objects, or attributes / texts
+        """
+        return self._ele(locator, timeout=timeout, index=None)
+
+    def s_ele(self, locator=None, index=1, timeout=None):
+        """Find one matching element and return it as a SessionElement
+        :param locator: locator of the element, a loc tuple or a query string
+        :param index: which match to return, starting from 1; negative counts from the end
+        :param timeout: search timeout in seconds, defaults to the owner page's timeout
+        :return: SessionElement object, or attribute / text
+        """
+        return (make_session_ele(self, locator, index=index, method='s_ele()')
+                if self.ele(locator, index=index, timeout=timeout)
+                else NoneElement(self, method='s_ele()', args={'locator': locator, 'index': index}))
+
+    def s_eles(self, locator=None, timeout=None):
+        """Find all matching elements and return them as a list of SessionElement
+        :param locator: locator
+        :param timeout: search timeout in seconds, defaults to the owner page's timeout
+        :return: list of SessionElement objects, or attributes / texts
+        """
+        return (make_session_ele(self, locator, index=None)
+                if self.ele(locator, timeout=timeout) else SessionElementsList())
+
+    def _find_elements(self, locator, timeout=None, index=1, relative=False, raise_err=None):
+        """Return descendant elements, attributes or node texts matching the locator; the first one by default
+        :param locator: locator of the element, a loc tuple or a query string
+        :param timeout: search timeout in seconds
+        :param index: which match to return, starting from 1; negative counts from the end; None returns all
+        :param relative: relative-locating flag used by MixTab
+        :param raise_err: whether to raise when nothing is found, None follows the global setting (unused in this body)
+        :return: ChromiumElement object, text, attribute, or a list of them
+        """
+        return find_in_chromium_ele(self, locator, index, timeout, relative=relative)
+
+    def style(self, style, pseudo_ele=''):
+        """Return the value of a computed style property; pseudo-element styles can be read too
+        :param style: name of the style property
+        :param pseudo_ele: pseudo-element name, if any ('::' is prepended unless it already starts with ':')
+        :return: value of the style property
+        """
+        if pseudo_ele:
+            pseudo_ele = f', "{pseudo_ele}"' if pseudo_ele.startswith(':') else f', "::{pseudo_ele}"'
+        return self._run_js(f'return window.getComputedStyle(this{pseudo_ele}).getPropertyValue("{style}");')
+
+    def src(self, timeout=None, base64_to_bytes=True):
+        """Return the resource the element's src points to; base64 data can be returned as bytes, otherwise str
+        :param timeout: seconds to wait for the resource to load
+        :param base64_to_bytes: when True, base64 data is converted to bytes
+        :return: resource content, None if it could not be fetched in time
+        """
+        timeout = self.owner.timeout if timeout is None else timeout
+        if self.tag == 'img':  # wait for the image to finish loading
+            js = ('return this.complete && typeof this.naturalWidth != "undefined" '
+                  '&& this.naturalWidth > 0 && typeof this.naturalHeight != "undefined" '
+                  '&& this.naturalHeight > 0')
+            end_time = perf_counter() + timeout
+            while not self._run_js(js) and perf_counter() < end_time:
+                sleep(.1)
+
+        src = self.attr('src')
+        if not src:
+            raise RuntimeError('元素没有src值或该值为空。')
+        if src.lower().startswith('data:image'):  # inline data url: the payload is everything after the first comma
+            if base64_to_bytes:
+                from base64 import b64decode
+                return b64decode(src.split(',', 1)[-1])
+            else:
+                return src.split(',', 1)[-1]
+
+        is_blob = src.startswith('blob')
+        result = None
+        end_time = perf_counter() + timeout
+        if is_blob:
+            while perf_counter() < end_time:
+                result = get_blob(self.owner, src, base64_to_bytes)
+                if result:
+                    break
+                sleep(.05)
+
+        else:
+            while perf_counter() < end_time:
+                src = self.property('currentSrc')
+                if not src:
+                    continue
+
+                node = self.owner._run_cdp('DOM.describeNode', backendNodeId=self._backend_id)['node']
+                frame = node.get('frameId', None) or self.owner._frame_id
+
+                try:
+                    result = self.owner._run_cdp('Page.getResourceContent', frameId=frame, url=src)
+                    break
+                except CDPError:  # resource not available yet: retry until the deadline
+                    pass
+                sleep(.1)
+
+        if not result:
+            return None
+
+        elif is_blob:
+            return result
+
+        elif result['base64Encoded'] and base64_to_bytes:
+            from base64 import b64decode
+            return b64decode(result['content'])
+        else:
+            return result['content']
+
+    def save(self, path=None, name=None, timeout=None, rename=True):
+        """Save an image, or the resource of any other element that has a src attribute
+        :param path: folder to save into, None means the current folder
+        :param name: file name, None derives it from the resource url
+        :param timeout: seconds to wait for the resource to load
+        :param rename: whether to rename automatically when a file with the same name exists
+        :return: the saved path as str
+        """
+        data = self.src(timeout=timeout)
+        if not data:
+            raise NoResourceError
+
+        path = path or '.'
+        if not name and self.tag == 'img':
+            src = self.attr('src')
+            if src.lower().startswith('data:image'):
+                r = search(r'data:image/(.*?);base64,', src)
+                name = f'img.{r.group(1)}' if r else None
+        path = Path(path) / make_valid_name(name or basename(self.property('currentSrc')))
+        if not path.suffix:
+            path = path.with_suffix('.jpg')
+        if rename:
+            path = get_usable_path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path = path.absolute()
+        write_type = 'wb' if isinstance(data, bytes) else 'w'
+
+        with open(path, write_type) as f:
+            f.write(data)
+
+        return str(path)
+
+    def get_screenshot(self, path=None, name=None, as_bytes=None, as_base64=None, scroll_to_center=True):
+        """Take a screenshot of this element; it can be saved to a file or returned as bytes
+        :param path: folder to save the file into
+        :param name: full file name, the suffix may be 'jpg', 'jpeg', 'png' or 'webp'
+        :param as_bytes: return the image as bytes, may be 'jpg', 'jpeg', 'png', 'webp'; when set, path and as_base64 are ignored
+        :param as_base64: return the image as a base64 string, may be 'jpg', 'jpeg', 'png', 'webp'; when set, path is ignored
+        :param scroll_to_center: whether to scroll the element to the viewport centre before the screenshot
+        :return: full path of the image, or the image data
+        """
+        if self.tag == 'img':  # wait for the image to finish loading
+            js = ('return this.complete && typeof this.naturalWidth != "undefined" && this.naturalWidth > 0 '
+                  '&& typeof this.naturalHeight != "undefined" && this.naturalHeight > 0')
+            end_time = perf_counter() + self.owner.timeout
+            while not self._run_js(js) and perf_counter() < end_time:
+                sleep(.1)
+        if scroll_to_center:
+            self.scroll.to_see(center=True)
+
+        left, top = self.rect.location
+        width, height = self.rect.size
+        left_top = (left, top)
+        right_bottom = (left + width, top + height)
+        if not name:
+            name = f'{self.tag}.jpg'
+
+        return self.owner._get_screenshot(path, name, as_bytes=as_bytes, as_base64=as_base64, full_page=False,
+                                          left_top=left_top, right_bottom=right_bottom, ele=self)
+
+    def input(self, vals, clear=False, by_js=False):
+        """Type text or key combinations; also sets file paths on a file input (paths separated by a newline)
+        :param vals: text value or key combination
+        :param clear: whether to clear the text box before typing
+        :param by_js: whether to set the value with js; key combinations cannot be sent this way
+        :return: None
+        """
+        if self.tag == 'input' and self.attr('type') == 'file':
+            return self._set_file_input(vals)
+
+        if by_js:
+            if clear:
+                self.clear(True)
+            if isinstance(vals, (list, tuple)):
+                vals = ''.join([str(i) for i in vals])
+            self.set.property('value', str(vals))
+            self._run_js('this.dispatchEvent(new Event("change", {bubbles: true}));')
+            return
+
+        self.wait.clickable(wait_moved=False, timeout=.5)
+        if clear and vals not in ('\n', '\ue007'):
+            self.clear(by_js=False)
+        else:
+            self._input_focus()
+
+        if isinstance(vals, str) and vals not in ('\ue003', '\ue017', '\ue010', '\ue011',
+                                                  '\ue012', '\ue013', '\ue014', '\ue015',):
+            input_text_or_keys(self.owner, vals)
+        else:
+            self.owner.actions.type(vals)
+
+    def clear(self, by_js=False):
+        """Clear the element's text
+        :param by_js: whether to clear with js; False simulates select-all + delete key input
+        :return: None
+        """
+        if by_js:
+            self._run_js("this.value='';")
+            self._run_js('this.dispatchEvent(new Event("change", {bubbles: true}));')
+            return
+
+        self._input_focus()
+        self.input(('\ue009', 'a', '\ue017'), clear=False)
+
+    def _input_focus(self):
+        """Focus the element before typing; falls back to a click when DOM.focus fails"""
+        try:
+            self.owner._run_cdp('DOM.focus', backendNodeId=self._backend_id)
+        except Exception:
+            self.click(by_js=None)
+
+    def focus(self):
+        """Focus the element; falls back to js focus() when DOM.focus fails"""
+        try:
+            self.owner._run_cdp('DOM.focus', backendNodeId=self._backend_id)
+        except Exception:
+            self._run_js('this.focus();')
+
+    def hover(self, offset_x=None, offset_y=None):
+        """Hover the mouse over the element; offsets are relative to its top-left corner, omit both to hover its midpoint
+        :param offset_x: x offset relative to the element's top-left corner
+        :param offset_y: y offset relative to the element's top-left corner
+        :return: None
+        """
+        self.owner.actions.move_to(self, offset_x=offset_x, offset_y=offset_y, duration=.1)
+
+    def drag(self, offset_x=0, offset_y=0, duration=.5):
+        """Drag this element by a relative offset, measured from its midpoint
+        :param offset_x: change in x
+        :param offset_y: change in y
+        :param duration: drag duration in seconds, 0 moves instantly
+        :return: None
+        """
+        curr_x, curr_y = self.rect.midpoint
+        offset_x += curr_x
+        offset_y += curr_y
+        self.drag_to((offset_x, offset_y), duration)
+
+    def drag_to(self, ele_or_loc, duration=.5):
+        """Drag this element onto another element or to a coordinate tuple (x, y)
+        :param ele_or_loc: another element (its midpoint is used) or a coordinate tuple
+        :param duration: drag duration in seconds, 0 moves instantly
+        :return: None
+        """
+        if isinstance(ele_or_loc, ChromiumElement):
+            ele_or_loc = ele_or_loc.rect.midpoint
+        elif not isinstance(ele_or_loc, (list, tuple)):
+            raise TypeError('需要ChromiumElement对象或坐标。')
+        self.owner.actions.hold(self).move_to(ele_or_loc, duration=duration).release()
+
+    def _get_obj_id(self, node_id=None, backend_id=None):
+        """Resolve the js object id from a node id or a backend id
+        :param node_id: node id in CDP (takes precedence when given)
+        :param backend_id: backend id
+        :return: object id in js
+        """
+        if node_id:
+            return self.owner._run_cdp('DOM.resolveNode', nodeId=node_id)['object']['objectId']
+        else:
+            return self.owner._run_cdp('DOM.resolveNode', backendNodeId=backend_id)['object']['objectId']
+
+    def _get_node_id(self, obj_id=None, backend_id=None):
+        """Resolve the CDP node id from an object id or a backend id
+        :param obj_id: object id in js (takes precedence when given)
+        :param backend_id: backend id
+        :return: node id in CDP
+        """
+        if obj_id:
+            return self.owner._run_cdp('DOM.requestNode', objectId=obj_id)['nodeId']
+        else:
+            n = self.owner._run_cdp('DOM.describeNode', backendNodeId=backend_id)['node']
+            self._tag = n['localName'].lower()  # cache the tag lower-cased, matching the tag property
+            return n['nodeId']
+
+    def _get_backend_id(self, node_id):
+        """Resolve the backend id from a node id
+        :param node_id: node id in CDP
+        :return: backend id
+        """
+        n = self.owner._run_cdp('DOM.describeNode', nodeId=node_id)['node']
+        self._tag = n['localName'].lower()  # cache the tag lower-cased, matching the tag property
+        return n['backendNodeId']
+
+    def _refresh_id(self):
+        """Refresh the object id and node id from the backend id"""
+        self._obj_id = self._get_obj_id(backend_id=self._backend_id)
+        self._node_id = self._get_node_id(obj_id=self._obj_id)
+
def _get_ele_path(self, mode): + """返获取绝对的css路径或xpath路径""" + if mode == 'xpath': + txt1 = 'let tag = el.nodeName.toLowerCase();' + txt3 = ''' && sib.nodeName.toLowerCase()==tag''' + txt4 = ''' + if(nth>1){path = '/' + tag + '[' + nth + ']' + path;} + else{path = '/' + tag + path;}''' + txt5 = '''return path;''' + + elif mode == 'css': + txt1 = ''' + let i = el.getAttribute("id"); + if (i){path = '>' + el.tagName.toLowerCase() + "#" + i + path; + break;} + ''' + txt3 = '' + txt4 = '''path = '>' + el.tagName.toLowerCase() + ":nth-child(" + nth + ")" + path;''' + txt5 = '''return path.substr(1);''' + + else: + raise ValueError(f"mode参数只能是'xpath'或'css',现在是:'{mode}'。") + + js = '''function(){ + function e(el) { + //return el; + if (!(el instanceof Element)) return; + let path = ''; + while (el.nodeType === Node.ELEMENT_NODE) { + ''' + txt1 + ''' + let sib = el, nth = 0; + while (sib) { + if(sib.nodeType === Node.ELEMENT_NODE''' + txt3 + '''){nth += 1;} + sib = sib.previousSibling; + } + ''' + txt4 + ''' + el = el.parentNode; + } + ''' + txt5 + ''' + } + return e(this);} + ''' + t = self._run_js(js) + return f'{t}' if mode == 'css' else t + + def _set_file_input(self, files): + """对上传控件写入路径 + :param files: 文件路径列表或字符串,字符串时多个文件用回车分隔 + :return: None + """ + if isinstance(files, str): + files = files.split('\n') + files = [str(Path(i).absolute()) for i in files] + self.owner._run_cdp('DOM.setFileInputFiles', files=files, backendNodeId=self._backend_id) + + +class ShadowRoot(BaseElement): + """ShadowRoot是用于处理ShadowRoot的类,使用方法和ChromiumElement基本一致""" + + def __init__(self, parent_ele, obj_id=None, backend_id=None): + """ + :param parent_ele: shadow root 所在父元素 + :param obj_id: js中的object id + :param backend_id: cdp中的backend id + """ + super().__init__(parent_ele.owner) + self.tab = self.owner._tab + self.parent_ele = parent_ele + if backend_id: + self._backend_id = backend_id + self._obj_id = self._get_obj_id(backend_id) + self._node_id = self._get_node_id(self._obj_id) + elif 
obj_id: + self._obj_id = obj_id + self._node_id = self._get_node_id(obj_id) + self._backend_id = self._get_backend_id(self._node_id) + self._states = None + self._type = 'ShadowRoot' + + def __repr__(self): + return f'' + + def __call__(self, locator, index=1, timeout=None): + """在内部查找元素 + 例:ele2 = ele1('@id=ele_id') + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param index: 获取第几个,从1开始,可传入负数获取倒数第几个 + :param timeout: 超时时间(秒) + :return: 元素对象或属性、文本 + """ + return self.ele(locator, index=index, timeout=timeout) + + def __eq__(self, other): + return self._backend_id == getattr(other, '_backend_id', None) + + @property + def tag(self): + """返回元素标签名""" + return 'shadow-root' + + @property + def html(self): + """返回outerHTML文本""" + return f'{self.inner_html}' + + @property + def inner_html(self): + """返回内部的html文本""" + return self._run_js('return this.innerHTML;') + + @property + def states(self): + """返回用于获取元素状态的对象""" + if self._states is None: + self._states = ShadowRootStates(self) + return self._states + + def run_js(self, script, *args, as_expr=False, timeout=None): + """运行javascript代码 + :param script: js文本 + :param args: 参数,按顺序在js文本中对应arguments[0]、arguments[1]... + :param as_expr: 是否作为表达式运行,为True时args无效 + :param timeout: js超时时间(秒),为None则使用页面timeouts.script设置 + :return: 运行的结果 + """ + return self._run_js(script, *args, as_expr=as_expr, timeout=timeout) + + def _run_js(self, script, *args, as_expr=False, timeout=None): + """运行javascript代码 + :param script: js文本 + :param args: 参数,按顺序在js文本中对应arguments[0]、arguments[1]... + :param as_expr: 是否作为表达式运行,为True时args无效 + :param timeout: js超时时间(秒),为None则使用页面timeouts.script设置 + :return: 运行的结果 + """ + return run_js(self, script, as_expr, self.owner.timeouts.script if timeout is None else timeout, args) + + def run_async_js(self, script, *args, as_expr=False, timeout=None): + """以异步方式执行js代码 + :param script: js文本 + :param args: 参数,按顺序在js文本中对应arguments[0]、arguments[1]... 
+ :param as_expr: 是否作为表达式运行,为True时args无效 + :param timeout: js超时时间(秒),为None则使用页面timeouts.script设置 + :return: None + """ + from threading import Thread + Thread(target=run_js, args=(self, script, as_expr, + self.owner.timeouts.script if timeout is None else timeout, args)).start() + + def parent(self, level_or_loc=1, index=1, timeout=0): + """返回上面某一级父元素,可指定层数或用查询语法定位 + :param level_or_loc: 第几级父元素,或定位符 + :param index: 当level_or_loc传入定位符,使用此参数选择第几个结果 + :param timeout: 查找超时时间(秒) + :return: ChromiumElement对象 + """ + if isinstance(level_or_loc, int): + loc = f'xpath:./ancestor-or-self::*[{level_or_loc}]' + + elif isinstance(level_or_loc, (tuple, str)): + loc = get_loc(level_or_loc, True) + + if loc[0] == 'css selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + + loc = f'xpath:./ancestor-or-self::{loc[1].lstrip(". / ")}[{index}]' + + else: + raise TypeError('level_or_loc参数只能是tuple、int或str。') + + return self.parent_ele._ele(loc, timeout=timeout, relative=True, raise_err=False, method='parent()') + + def child(self, locator='', index=1, timeout=None): + """返回直接子元素元素或节点组成的列表,可用查询语法筛选 + :param locator: 用于筛选的查询语法 + :param index: 第几个查询结果,1开始 + :param timeout: 查找超时时间(秒) + :return: 直接子元素或节点文本组成的列表 + """ + if not locator: + loc = '*' + else: + loc = get_loc(locator, True) # 把定位符转换为xpath + if loc[0] == 'css selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + loc = loc[1].lstrip('./') + + loc = f'xpath:./{loc}' + ele = self._ele(loc, index=index, relative=True, timeout=timeout) + + return ele if ele else NoneElement(self.owner, 'child()', + {'locator': locator, 'index': index, 'timeout': timeout}) + + def next(self, locator='', index=1, timeout=None): + """返回当前元素后面一个符合条件的同级元素,可用查询语法筛选,可指定返回筛选结果的第几个 + :param locator: 用于筛选的查询语法 + :param index: 第几个查询结果,1开始 + :param timeout: 查找超时时间(秒) + :return: ChromiumElement对象 + """ + loc = get_loc(locator, True) + if loc[0] == 'css selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + + loc = loc[1].lstrip('./') + 
xpath = f'xpath:./{loc}' + ele = self.parent_ele._ele(xpath, index=index, relative=True, timeout=timeout) + + return ele if ele else NoneElement(self.owner, 'next()', + {'locator': locator, 'index': index, 'timeout': timeout}) + + def before(self, locator='', index=1, timeout=None): + """返回文档中当前元素前面符合条件的一个元素,可用查询语法筛选,可指定返回筛选结果的第几个 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param index: 前面第几个查询结果,1开始 + :param timeout: 查找超时时间(秒) + :return: 本元素前面的某个元素或节点 + """ + loc = get_loc(locator, True) + if loc[0] == 'css selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + + loc = loc[1].lstrip('./') + xpath = f'xpath:./preceding::{loc}' + ele = self.parent_ele._ele(xpath, index=index, relative=True, timeout=timeout) + + return ele if ele else NoneElement(self.owner, 'before()', + {'locator': locator, 'index': index, 'timeout': timeout}) + + def after(self, locator='', index=1, timeout=None): + """返回文档中此当前元素后面符合条件的一个元素,可用查询语法筛选,可指定返回筛选结果的第几个 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param index: 后面第几个查询结果,1开始 + :param timeout: 查找超时时间(秒) + :return: 本元素后面的某个元素或节点 + """ + nodes = self.afters(locator=locator, timeout=timeout) + return nodes[index - 1] if nodes else NoneElement(self.owner, 'after()', + {'locator': locator, 'index': index, 'timeout': timeout}) + + def children(self, locator='', timeout=None): + """返回当前元素符合条件的直接子元素或节点组成的列表,可用查询语法筛选 + :param locator: 用于筛选的查询语法 + :param timeout: 查找超时时间(秒) + :return: 直接子元素或节点文本组成的列表 + """ + if not locator: + loc = '*' + else: + loc = get_loc(locator, True) # 把定位符转换为xpath + if loc[0] == 'css selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + loc = loc[1].lstrip('./') + + loc = f'xpath:./{loc}' + return self._ele(loc, index=None, relative=True, timeout=timeout) + + def nexts(self, locator='', timeout=None): + """返回当前元素后面符合条件的同级元素或节点组成的列表,可用查询语法筛选 + :param locator: 用于筛选的查询语法 + :param timeout: 查找超时时间(秒) + :return: ChromiumElement对象组成的列表 + """ + loc = get_loc(locator, True) + if loc[0] == 'css 
selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + + loc = loc[1].lstrip('./') + xpath = f'xpath:./{loc}' + return self.parent_ele._ele(xpath, index=None, relative=True, timeout=timeout) + + def befores(self, locator='', timeout=None): + """返回文档中当前元素前面符合条件的元素或节点组成的列表,可用查询语法筛选 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param timeout: 查找超时时间(秒) + :return: 本元素前面的元素或节点组成的列表 + """ + loc = get_loc(locator, True) + if loc[0] == 'css selector': + raise ValueError('此css selector语法不受支持,请换成xpath。') + + loc = loc[1].lstrip('./') + xpath = f'xpath:./preceding::{loc}' + return self.parent_ele._ele(xpath, index=None, relative=True, timeout=timeout) + + def afters(self, locator='', timeout=None): + """返回文档中当前元素后面符合条件的元素或节点组成的列表,可用查询语法筛选 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param timeout: 查找超时时间(秒) + :return: 本元素后面的元素或节点组成的列表 + """ + eles1 = self.nexts(locator) + loc = get_loc(locator, True)[1].lstrip('./') + xpath = f'xpath:./following::{loc}' + return eles1 + self.parent_ele._ele(xpath, index=None, relative=True, timeout=timeout) + + def ele(self, locator, index=1, timeout=None): + """返回当前元素下级符合条件的一个元素 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param index: 获取第几个元素,从1开始,可传入负数获取倒数第几个 + :param timeout: 查找元素超时时间(秒),默认与元素所在页面等待时间一致 + :return: ChromiumElement对象 + """ + return self._ele(locator, timeout, index=index, method='ele()') + + def eles(self, locator, timeout=None): + """返回当前元素下级所有符合条件的子元素 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param timeout: 查找元素超时时间(秒),默认与元素所在页面等待时间一致 + :return: ChromiumElement对象组成的列表 + """ + return self._ele(locator, timeout=timeout, index=None) + + def s_ele(self, locator=None, index=1, timeout=None): + """查找一个符合条件的元素以SessionElement形式返回,处理复杂页面时效率很高 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param index: 获取第几个,从1开始,可传入负数获取倒数第几个 + :param timeout: 查找元素超时时间(秒),默认与元素所在页面等待时间一致 + :return: SessionElement对象或属性、文本 + """ + return (make_session_ele(self, locator, index=index, method='s_ele()') + if self.ele(locator, 
index=index, timeout=timeout) + else NoneElement(self, method='s_ele()', args={'locator': locator, 'index': index})) + + def s_eles(self, locator, timeout=None): + """查找所有符合条件的元素以SessionElement列表形式返回,处理复杂页面时效率很高 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param timeout: 查找元素超时时间(秒),默认与元素所在页面等待时间一致 + :return: SessionElement对象 + """ + return (make_session_ele(self, locator, index=None) + if self.ele(locator, timeout=timeout) else SessionElementsList()) + + def _find_elements(self, locator, timeout=None, index=1, relative=False, raise_err=None): + """返回当前元素下级符合条件的子元素、属性或节点文本,默认返回第一个 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param timeout: 查找元素超时时间(秒) + :param index: 第几个结果,从1开始,可传入负数获取倒数第几个,为None返回所有 + :param relative: MixTab用的表示是否相对定位的参数 + :param raise_err: 找不到元素是是否抛出异常,为None时根据全局设置 + :return: ChromiumElement对象或其组成的列表 + """ + loc = get_loc(locator, css_mode=False) + if loc[0] == 'css selector' and str(loc[1]).startswith(':root'): + loc = loc[0], loc[1][5:] + + def do_find(): + if loc[0] == 'css selector': + if index == 1: + nod_id = self.owner._run_cdp('DOM.querySelector', nodeId=self._node_id, selector=loc[1])['nodeId'] + if nod_id: + r = make_chromium_eles(self.owner, _ids=nod_id, is_obj_id=False) + return None if r is False else r + + else: + nod_ids = self.owner._run_cdp('DOM.querySelectorAll', + nodeId=self._node_id, selector=loc[1])['nodeId'] + r = make_chromium_eles(self.owner, _ids=nod_ids, index=index, is_obj_id=False) + return None if r is False else r + + else: + eles = make_session_ele(self, loc, index=None) + if not eles: + return None + + css = [] + for i in eles: + c = i.css_path + if c in ('html:nth-child(1)', 'html:nth-child(1)>body:nth-child(1)', + 'html:nth-child(1)>body:nth-child(1)>shadow_root:nth-child(1)'): + continue + elif c.startswith('html:nth-child(1)>body:nth-child(1)>shadow_root:nth-child(1)'): + c = c[61:] + css.append(c) + if index is not None: + try: + node_id = self.owner._run_cdp('DOM.querySelector', nodeId=self._node_id, + 
selector=css[index - 1])['nodeId'] + except IndexError: + return None + r = make_chromium_eles(self.owner, _ids=node_id, is_obj_id=False) + return None if r is False else r + else: + node_ids = [self.owner._run_cdp('DOM.querySelector', nodeId=self._node_id, selector=i)['nodeId'] + for i in css] + node_ids = [i for i in node_ids if i] + if not node_ids: + return None + r = make_chromium_eles(self.owner, _ids=node_ids, index=index, is_obj_id=False) + return None if r is False else r + + timeout = timeout if timeout is not None else self.owner.timeout + end_time = perf_counter() + timeout + result = do_find() + while result is None and perf_counter() <= end_time: + sleep(.1) + result = do_find() + + if result: + return result + return NoneElement(self.owner) if index is not None else [] + + def _get_node_id(self, obj_id): + """返回元素node id""" + return self.owner._run_cdp('DOM.requestNode', objectId=obj_id)['nodeId'] + + def _get_obj_id(self, back_id): + """返回元素object id""" + return self.owner._run_cdp('DOM.resolveNode', backendNodeId=back_id)['object']['objectId'] + + def _get_backend_id(self, node_id): + """返回元素object id""" + r = self.owner._run_cdp('DOM.describeNode', nodeId=node_id)['node'] + self._tag = r['localName'].lower() + return r['backendNodeId'] + + +def find_in_chromium_ele(ele, locator, index=1, timeout=None, relative=True): + """在chromium元素中查找 + :param ele: ChromiumElement对象 + :param locator: 元素定位元组 + :param index: 第几个结果,从1开始,可传入负数获取倒数第几个,为None返回所有 + :param timeout: 查找元素超时时间(秒) + :param relative: MixTab用于标记是否相对定位使用 + :return: 返回ChromiumElement元素或它们组成的列表 + """ + # ---------------处理定位符--------------- + if isinstance(locator, (str, tuple)): + loc = get_loc(locator) + else: + raise ValueError(f"定位符必须为str或长度为2的tuple对象。现在是:{locator}") + + loc_str = loc[1] + if loc[0] == 'xpath' and loc[1].lstrip().startswith('/'): + loc_str = f'.{loc_str}' + elif loc[0] == 'css selector' and loc[1].lstrip().startswith('>'): + loc_str = f'{ele.css_path}{loc[1]}' + loc = loc[0], 
loc_str + + timeout = timeout if timeout is not None else ele.owner.timeout + + # ---------------执行查找----------------- + if loc[0] == 'xpath': + return find_by_xpath(ele, loc[1], index, timeout, relative=relative) + + else: + return find_by_css(ele, loc[1], index, timeout) + + +def find_by_xpath(ele, xpath, index, timeout, relative=True): + """执行用xpath在元素中查找元素 + :param ele: 在此元素中查找 + :param xpath: 查找语句 + :param index: 第几个结果,从1开始,可传入负数获取倒数第几个,为None返回所有 + :param timeout: 超时时间(秒) + :param relative: 是否相对定位 + :return: ChromiumElement或其组成的列表 + """ + type_txt = '9' if index == 1 else '7' + node_txt = 'this.contentDocument' if ele.tag in __FRAME_ELEMENT__ and not relative else 'this' + js = make_js_for_find_ele_by_xpath(xpath, type_txt, node_txt) + ele.owner.wait.doc_loaded() + + def do_find(): + res = ele.owner._run_cdp('Runtime.callFunctionOn', functionDeclaration=js, objectId=ele._obj_id, + returnByValue=False, awaitPromise=True, userGesture=True) + if res['result']['type'] == 'string': + return res['result']['value'] + if 'exceptionDetails' in res: + if 'The result is not a node set' in res['result']['description']: + js1 = make_js_for_find_ele_by_xpath(xpath, '1', node_txt) + res = ele.owner._run_cdp('Runtime.callFunctionOn', functionDeclaration=js1, objectId=ele._obj_id, + returnByValue=False, awaitPromise=True, userGesture=True) + return res['result']['value'] + else: + raise SyntaxError(f'查询语句错误:\n{res}') + + if res['result']['subtype'] == 'null' or res['result']['description'] in ('NodeList(0)', 'Array(0)'): + return None + + if index == 1: + r = make_chromium_eles(ele.owner, _ids=res['result']['objectId'], is_obj_id=True) + return None if r is False else r + + else: + res = ele.owner._run_cdp('Runtime.getProperties', objectId=res['result']['objectId'], + ownProperties=True)['result'][:-1] + if index is None: + r = ChromiumElementsList(page=ele.owner) + for i in res: + if i['value']['type'] == 'object': + r.append(make_chromium_eles(ele.owner, 
_ids=i['value']['objectId'], is_obj_id=True)) + else: + r.append(i['value']['value']) + return None if False in r else r + + else: + eles_count = len(res) + if eles_count == 0 or abs(index) > eles_count: + return None + + index1 = eles_count + index + 1 if index < 0 else index + res = res[index1 - 1] + if res['value']['type'] == 'object': + r = make_chromium_eles(ele.owner, _ids=res['value']['objectId'], is_obj_id=True) + else: + r = res['value']['value'] + return None if r is False else r + + end_time = perf_counter() + timeout + result = do_find() + while result is None and perf_counter() < end_time: + sleep(.1) + result = do_find() + + if result: + return result + return NoneElement(ele.owner) if index is not None else ChromiumElementsList(page=ele.owner) + + +def find_by_css(ele, selector, index, timeout): + """执行用css selector在元素中查找元素 + :param ele: 在此元素中查找 + :param selector: 查找语句 + :param index: 第几个结果,从1开始,可传入负数获取倒数第几个,为None返回所有 + :param timeout: 超时时间(秒) + :return: ChromiumElement或其组成的列表 + """ + selector = selector.replace('"', r'\"') + find_all = '' if index == 1 else 'All' + node_txt = 'this.contentDocument' if ele.tag in ('iframe', 'frame', 'shadow-root') else 'this' + js = f'function(){{return {node_txt}.querySelector{find_all}("{selector}");}}' + + ele.owner.wait.doc_loaded() + + def do_find(): + res = ele.owner._run_cdp('Runtime.callFunctionOn', functionDeclaration=js, objectId=ele._obj_id, + returnByValue=False, awaitPromise=True, userGesture=True) + + if 'exceptionDetails' in res: + raise SyntaxError(f'查询语句错误:\n{res}') + if res['result']['subtype'] == 'null' or res['result']['description'] in ('NodeList(0)', 'Array(0)'): + return None + + if index == 1: + r = make_chromium_eles(ele.owner, _ids=res['result']['objectId'], is_obj_id=True) + return None if r is False else r + + else: + obj_ids = [i['value']['objectId'] for i in ele.owner._run_cdp('Runtime.getProperties', + objectId=res['result']['objectId'], + ownProperties=True)['result']] + r = 
make_chromium_eles(ele.owner, _ids=obj_ids, index=index, is_obj_id=True) + return None if r is False else r + + end_time = perf_counter() + timeout + result = do_find() + while result is None and perf_counter() < end_time: + sleep(.1) + result = do_find() + + if result: + return result + return NoneElement(ele.owner) if index is not None else ChromiumElementsList(page=ele.owner) + + +def make_chromium_eles(page, _ids, index=1, is_obj_id=True, ele_only=False): + """根据node id或object id生成相应元素对象 + :param page: ChromiumPage对象 + :param _ids: 元素的id列表 + :param index: 获取第几个,为None返回全部 + :param is_obj_id: 传入的id是obj id还是node id + :param ele_only: 是否只返回ele,在页面查找元素时生效 + :return: 浏览器元素对象或它们组成的列表,生成失败返回False + """ + if is_obj_id: + get_node_func = _get_node_by_obj_id + else: + get_node_func = _get_node_by_node_id + if not isinstance(_ids, (list, tuple)): + _ids = (_ids,) + + if index is not None: # 获取一个 + if ele_only: + for obj_id in _ids: + tmp = get_node_func(page, obj_id, ele_only) + if tmp is not None: + return tmp + return False + + else: + obj_id = _ids[index - 1] + return get_node_func(page, obj_id, ele_only) + + else: # 获取全部 + nodes = ChromiumElementsList(page=page) + for obj_id in _ids: + # if obj_id == 0: + # continue + tmp = get_node_func(page, obj_id, ele_only) + if tmp is False: + return False + elif tmp is not None: + nodes.append(tmp) + return nodes + + +def _get_node_info(page, id_type, _id): + if not _id: + return False + arg = {id_type: _id} + node = page.driver.run('DOM.describeNode', **arg) + if 'error' in node: + return False + return node + + +def _get_node_by_obj_id(page, obj_id, ele_only): + """根据obj id返回元素对象或文本,ele_only时如果是文本返回None,出错返回False""" + node = _get_node_info(page, 'objectId', obj_id) + if node is False: + return False + if node['node']['nodeName'] in ('#text', '#comment'): + return None if ele_only else node['node']['nodeValue'] + else: + return _make_ele(page, obj_id, node) + + +def _get_node_by_node_id(page, node_id, ele_only): + """根据node 
id返回元素对象或文本,ele_only时如果是文本返回None,出错返回False""" + node = _get_node_info(page, 'nodeId', node_id) + if node is False: + return False + if node['node']['nodeName'] in ('#text', '#comment'): + return None if ele_only else node['node']['nodeValue'] + else: + obj_id = page.driver.run('DOM.resolveNode', nodeId=node_id) + if 'error' in obj_id: + return False + obj_id = obj_id['object']['objectId'] + return _make_ele(page, obj_id, node) + + +def _make_ele(page, obj_id, node): + ele = ChromiumElement(page, obj_id=obj_id, node_id=node['node']['nodeId'], + backend_id=node['node']['backendNodeId']) + if ele.tag in __FRAME_ELEMENT__: + from .._pages.chromium_frame import ChromiumFrame + ele = ChromiumFrame(page, ele, node) + return ele + + +def make_js_for_find_ele_by_xpath(xpath, type_txt, node_txt): + """生成用xpath在元素中查找元素的js文本 + :param xpath: xpath文本 + :param type_txt: 查找类型 + :param node_txt: 节点类型 + :return: js文本 + """ + for_txt = '' + + # 获取第一个元素、节点或属性 + if type_txt == '9': + return_txt = ''' +if(e.singleNodeValue==null){return null;} +else if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;} +else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;} +else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;} +else{return e.singleNodeValue;}''' + + # 按顺序获取所有元素、节点或属性 + elif type_txt == '7': + for_txt = """ +let a=new Array(); +for(let i = 0; i str: ... + + def __call__(self, + locator: Union[Tuple[str, str], str], + index: int = 1, + timeout: float = None) -> ChromiumElement: ... + + def __eq__(self, other: ChromiumElement) -> bool: ... + + @property + def tag(self) -> str: ... + + @property + def html(self) -> str: ... + + @property + def inner_html(self) -> str: ... + + @property + def attrs(self) -> dict: ... + + @property + def text(self) -> str: ... + + @property + def raw_text(self) -> str: ... 
+ + # -----------------d模式独有属性------------------- + @property + def set(self) -> ChromiumElementSetter: ... + + @property + def states(self) -> ElementStates: ... + + @property + def rect(self) -> ElementRect: ... + + @property + def pseudo(self) -> Pseudo: ... + + @property + def shadow_root(self) -> Union[None, ShadowRoot]: ... + + @property + def sr(self) -> Union[None, ShadowRoot]: ... + + @property + def scroll(self) -> ElementScroller: ... + + @property + def click(self) -> Clicker: ... + + def parent(self, + level_or_loc: Union[tuple, str, int] = 1, + index: int = 1, + timeout: float = 0) -> ChromiumElement: ... + + def child(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElement, str]: ... + + def prev(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElement, str]: ... + + def next(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElement, str]: ... + + def before(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElement, str]: ... + + def after(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElement, str]: ... + + def children(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElementsList, List[Union[ChromiumElement, str]]]: ... + + def prevs(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElementsList, List[Union[ChromiumElement, str]]]: ... 
+ + def nexts(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElementsList, List[Union[ChromiumElement, str]]]: ... + + def befores(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElementsList, List[Union[ChromiumElement, str]]]: ... + + def afters(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[ChromiumElementsList, List[Union[ChromiumElement, str]]]: ... + + def over(self, timeout: float = None) -> ChromiumElement: ... + + def south(self, loc_or_pixel: Union[str, int, None] = None, index: int = 1) -> ChromiumElement: ... + + def north(self, loc_or_pixel: Union[str, int, None] = None, index: int = 1) -> ChromiumElement: ... + + def west(self, loc_or_pixel: Union[str, int, None] = None, index: int = 1) -> ChromiumElement: ... + + def east(self, loc_or_pixel: Union[str, int, None] = None, index: int = 1) -> ChromiumElement: ... + + def offset(self, + locator: Optional[str] = None, + x: int = None, + y: int = None, + timeout: float = None) -> ChromiumElement: ... + + def _get_relative_eles(self, + mode: str = 'north', + locator: Union[int, str] = None, + index: int = 1) -> ChromiumElement: ... + + @property + def wait(self) -> ElementWaiter: ... + + @property + def select(self) -> SelectElement: ... + + @property + def value(self) -> str: ... + + def check(self, uncheck: bool = False, by_js: bool = False) -> None: ... + + def attr(self, name: str) -> Union[str, None]: ... + + def remove_attr(self, name: str) -> None: ... + + def property(self, name: str) -> Union[str, int, None]: ... + + def run_js(self, script: str, *args, as_expr: bool = False, timeout: float = None) -> Any: ... + + def _run_js(self, script: str, *args, as_expr: bool = False, timeout: float = None) -> Any: ... + + def run_async_js(self, script: str, *args, as_expr: bool = False) -> None: ... 
+ + def ele(self, + locator: Union[Tuple[str, str], str], + index: int = 1, + timeout: float = None) -> ChromiumElement: ... + + def eles(self, + locator: Union[Tuple[str, str], str], + timeout: float = None) -> ChromiumElementsList: ... + + def s_ele(self, + locator: Union[Tuple[str, str], str] = None, + index: int = 1, + timeout: float = None) -> SessionElement: ... + + def s_eles(self, + locator: Union[Tuple[str, str], str] = None, + timeout: float = None) -> SessionElementsList: ... + + def _find_elements(self, + locator: Union[Tuple[str, str], str], + timeout: float = None, + index: Optional[int] = 1, + relative: bool = False, + raise_err: bool = False) -> Union[ChromiumElement, ChromiumFrame, ChromiumElementsList]: ... + + def style(self, style: str, pseudo_ele: str = '') -> str: ... + + def src(self, timeout: float = None, base64_to_bytes: bool = True) -> Union[bytes, str, None]: ... + + def save(self, + path: [str, bool] = None, + name: str = None, + timeout: float = None, + rename: bool = True) -> str: ... + + def get_screenshot(self, + path: [str, Path] = None, + name: str = None, + as_bytes: PIC_TYPE = None, + as_base64: PIC_TYPE = None, + scroll_to_center: bool = True) -> Union[str, bytes]: ... + + def input(self, vals: Any, clear: bool = False, by_js: bool = False) -> None: ... + + def _set_file_input(self, files: Union[str, list, tuple]) -> None: ... + + def clear(self, by_js: bool = False) -> None: ... + + def _input_focus(self) -> None: ... + + def focus(self) -> None: ... + + def hover(self, offset_x: int = None, offset_y: int = None) -> None: ... + + def drag(self, offset_x: int = 0, offset_y: int = 0, duration: float = 0.5) -> None: ... + + def drag_to(self, ele_or_loc: Union[Tuple[int, int], str, ChromiumElement], duration: float = 0.5) -> None: ... + + def _get_obj_id(self, node_id: int = None, backend_id: int = None) -> str: ... + + def _get_node_id(self, obj_id: str = None, backend_id: int = None) -> int: ... 
+ + def _get_backend_id(self, node_id: int) -> int: ... + + def _refresh_id(self) -> None: ... + + def _get_ele_path(self, mode: str) -> str: ... + + +class ShadowRoot(BaseElement): + + def __init__(self, parent_ele: ChromiumElement, obj_id: str = None, backend_id: int = None): + self.owner: ChromiumBase = ... + self.tab: Union[ChromiumPage, ChromiumTab] = ... + self._obj_id: str = ... + self._node_id: int = ... + self._backend_id: int = ... + # self.page: ChromiumPage = ... + self.parent_ele: ChromiumElement = ... + self._states: ShadowRootStates = ... + + def __repr__(self) -> str: ... + + def __call__(self, + locator: Union[Tuple[str, str], str], + timeout: float = None) -> ChromiumElement: ... + + def __eq__(self, other: ShadowRoot) -> bool: ... + + @property + def states(self) -> ShadowRootStates: ... + + @property + def tag(self) -> str: ... + + @property + def html(self) -> str: ... + + @property + def inner_html(self) -> str: ... + + def run_js(self, script: str, *args, as_expr: bool = False, timeout: float = None) -> Any: ... + + def _run_js(self, script: str, *args, as_expr: bool = False, timeout: float = None) -> Any: ... + + def run_async_js(self, script: str, *args, as_expr: bool = False, timeout: float = None) -> None: ... + + def parent(self, level_or_loc: Union[str, int] = 1, index: int = 1, timeout: float = 0) -> ChromiumElement: ... + + def child(self, + locator: Union[Tuple[str, str], str] = '', + index: int = 1, timeout:float=None) -> ChromiumElement: ... + + def next(self, + locator: Union[Tuple[str, str], str] = '', + index: int = 1, timeout:float=None) -> ChromiumElement: ... + + def before(self, + locator: Union[Tuple[str, str], str] = '', + index: int = 1, timeout:float=None) -> ChromiumElement: ... + + def after(self, + locator: Union[Tuple[str, str], str] = '', + index: int = 1, timeout:float=None) -> ChromiumElement: ... + + def children(self, locator: Union[Tuple[str, str], str] = '', timeout:float=None) -> List[ChromiumElement]: ... 
+ + def nexts(self, locator: Union[Tuple[str, str], str] = '', timeout:float=None) -> List[ChromiumElement]: ... + + def befores(self, locator: Union[Tuple[str, str], str] = '', timeout:float=None) -> List[ChromiumElement]: ... + + def afters(self, locator: Union[Tuple[str, str], str] = '', timeout:float=None) -> List[ChromiumElement]: ... + + def ele(self, + locator: Union[Tuple[str, str], str], + index: int = 1, + timeout: float = None) -> ChromiumElement: ... + + def eles(self, + locator: Union[Tuple[str, str], str], + timeout: float = None) -> ChromiumElementsList: ... + + def s_ele(self, + locator: Union[Tuple[str, str], str] = None, + index: int = 1, + timeout: float = None) -> SessionElement: ... + + def s_eles(self, locator: Union[Tuple[str, str], str], timeout: float = None) -> SessionElementsList: ... + + def _find_elements(self, + locator: Union[Tuple[str, str], str], + timeout: float = None, + index: Optional[int] = 1, + relative: bool = False, + raise_err: bool = None) -> Union[ChromiumElement, ChromiumFrame, str, ChromiumElementsList]: ... + + def _get_node_id(self, obj_id: str) -> int: ... + + def _get_obj_id(self, back_id: int) -> str: ... + + def _get_backend_id(self, node_id: int) -> int: ... + + +def find_in_chromium_ele(ele: ChromiumElement, + loc: Union[str, Tuple[str, str]], + index: Optional[int] = 1, + timeout: float = None, + relative: bool = True) -> Union[ChromiumElement, List[ChromiumElement]]: ... + + +def find_by_xpath(ele: ChromiumElement, + xpath: str, + index: Optional[int], + timeout: float, + relative: bool = True) -> Union[ChromiumElement, List[ChromiumElement]]: ... + + +def find_by_css(ele: ChromiumElement, + selector: str, + index: Optional[int], + timeout: float) -> Union[ChromiumElement, List[ChromiumElement],]: ... 
+ + +def make_chromium_eles(page: Union[ChromiumBase, ChromiumPage, MixPage, ChromiumTab, ChromiumFrame], + _ids: Union[tuple, list, str, int], + index: Optional[int] = 1, + is_obj_id: bool = True, + ele_only: bool = False + ) -> Union[ChromiumElement, ChromiumFrame, ChromiumElementsList]: ... + + +def make_js_for_find_ele_by_xpath(xpath: str, type_txt: str, node_txt: str) -> str: ... + + +def run_js(page_or_ele: Union[ChromiumBase, ChromiumElement, ShadowRoot], + script: str, + as_expr: bool, + timeout: float, + args: tuple = ...) -> Any: ... + + +def parse_js_result(page: ChromiumBase, + ele: ChromiumElement, + result: dict, + end_time: float): ... + + +def convert_argument(arg: Any) -> dict: ... + + +class Pseudo(object): + def __init__(self, ele: ChromiumElement): + self._ele: ChromiumElement = ... + + @property + def before(self) -> str: ... + + @property + def after(self) -> str: ... diff --git a/src/flaresolverr/DrissionPage/_elements/none_element.py b/src/flaresolverr/DrissionPage/_elements/none_element.py new file mode 100644 index 0000000000..1294da2759 --- /dev/null +++ b/src/flaresolverr/DrissionPage/_elements/none_element.py @@ -0,0 +1,59 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. 
+""" +from .._functions.settings import Settings +from ..errors import ElementNotFoundError + + +class NoneElement(object): + def __init__(self, page=None, method=None, args=None): + """ + :param page: 元素所在页面 + :param method: 查找元素的方法 + :param args: 查找元素的参数 + """ + if method and Settings.raise_when_ele_not_found: # 无传入method时不自动抛出,由调用者处理 + raise ElementNotFoundError(None, method=method, arguments=args) + + if page: + self._none_ele_value = page._none_ele_value + self._none_ele_return_value = page._none_ele_return_value + else: + self._none_ele_value = None + self._none_ele_return_value = False + self.method = method + self.args = args + self._get = None + + def __call__(self, *args, **kwargs): + if not self._none_ele_return_value: + raise ElementNotFoundError(None, self.method, self.args) + else: + return self + + def __getattr__(self, item): + if not self._none_ele_return_value: + raise ElementNotFoundError(None, self.method, self.args) + elif item in ('ele', 's_ele', 'parent', 'child', 'next', 'prev', 'before', + 'after', 'get_frame', 'shadow_root', 'sr'): + return self + else: + if item in ('size', 'link', 'css_path', 'xpath', 'comments', 'texts', 'tag', 'html', 'inner_html', + 'attrs', 'text', 'raw_text', 'value', 'attr', 'style', 'src', 'property'): + return self._none_ele_value + else: + raise ElementNotFoundError(None, self.method, self.args) + + def __eq__(self, other): + if other is None: + return True + + def __bool__(self): + return False + + def __repr__(self): + return 'None' diff --git a/src/flaresolverr/DrissionPage/_elements/session_element.py b/src/flaresolverr/DrissionPage/_elements/session_element.py new file mode 100644 index 0000000000..0a4df1900d --- /dev/null +++ b/src/flaresolverr/DrissionPage/_elements/session_element.py @@ -0,0 +1,429 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. 
+""" +from html import unescape +from re import match, sub, DOTALL, search + +from lxml.etree import tostring +from lxml.html import HtmlElement, fromstring + +from .none_element import NoneElement +from .._base.base import DrissionElement, BasePage, BaseElement +from .._functions.elements import SessionElementsList +from .._functions.locator import get_loc +from .._functions.web import get_ele_txt, make_absolute_link + + +class SessionElement(DrissionElement): + """session模式的元素对象,包装了一个lxml的Element对象,并封装了常用功能""" + + def __init__(self, ele, owner=None): + """初始化对象 + :param ele: 被包装的HtmlElement元素 + :param owner: 元素所在页面对象,如果是从 html 文本生成的元素,则为 None + """ + super().__init__(owner) + self._inner_ele = ele + self._type = 'SessionElement' + + @property + def inner_ele(self): + return self._inner_ele + + def __repr__(self): + attrs = [f"{k}='{v}'" for k, v in self.attrs.items()] + return f'' + + def __call__(self, locator, index=1, timeout=None): + """在内部查找元素 + 例:ele2 = ele1('@id=ele_id') + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param index: 第几个元素,从1开始,可传入负数获取倒数第几个 + :param timeout: 不起实际作用 + :return: SessionElement对象或属性、文本 + """ + return self.ele(locator, index=index) + + def __eq__(self, other): + return self.xpath == getattr(other, 'xpath', None) + + @property + def tag(self): + """返回元素类型""" + return self._inner_ele.tag + + @property + def html(self): + """返回outerHTML文本""" + html = tostring(self._inner_ele, method="html").decode() + return unescape(html[:html.rfind('>') + 1]) # tostring()会把跟紧元素的文本节点也带上,因此要去掉 + + @property + def inner_html(self): + """返回元素innerHTML文本""" + r = match(r'<.*?>(.*)', self.html, flags=DOTALL) + return '' if not r else r.group(1) + + @property + def attrs(self): + """返回元素所有属性及值""" + return {attr: self.attr(attr) for attr, val in self.inner_ele.items()} + + @property + def text(self): + """返回元素内所有文本""" + return get_ele_txt(self) + + @property + def raw_text(self): + """返回未格式化处理的元素内文本""" + return str(self._inner_ele.text_content()) + + def 
parent(self, level_or_loc=1, index=1): + """返回上面某一级父元素,可指定层数或用查询语法定位 + :param level_or_loc: 第几级父元素,或定位符 + :param index: 当level_or_loc传入定位符,使用此参数选择第几个结果 + :return: 上级元素对象 + """ + return super().parent(level_or_loc, index) + + def child(self, locator='', index=1, timeout=None, ele_only=True): + """返回当前元素的一个符合条件的直接子元素,可用查询语法筛选,可指定返回筛选结果的第几个 + :param locator: 用于筛选的查询语法 + :param index: 第几个查询结果,1开始 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 直接子元素或节点文本 + """ + return super().child(locator, index, timeout, ele_only=ele_only) + + def prev(self, locator='', index=1, timeout=None, ele_only=True): + """返回当前元素前面一个符合条件的同级元素,可用查询语法筛选,可指定返回筛选结果的第几个 + :param locator: 用于筛选的查询语法 + :param index: 前面第几个查询结果,1开始 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 同级元素 + """ + return super().prev(locator, index, timeout, ele_only=ele_only) + + def next(self, locator='', index=1, timeout=None, ele_only=True): + """返回当前元素后面一个符合条件的同级元素,可用查询语法筛选,可指定返回筛选结果的第几个 + :param locator: 用于筛选的查询语法 + :param index: 第几个查询结果,1开始 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 同级元素 + """ + return super().next(locator, index, timeout, ele_only=ele_only) + + def before(self, locator='', index=1, timeout=None, ele_only=True): + """返回文档中当前元素前面符合条件的一个元素,可用查询语法筛选,可指定返回筛选结果的第几个 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param index: 前面第几个查询结果,1开始 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 本元素前面的某个元素或节点 + """ + return super().before(locator, index, timeout, ele_only=ele_only) + + def after(self, locator='', index=1, timeout=None, ele_only=True): + """返回文档中此当前元素后面符合条件的一个元素,可用查询语法筛选,可指定返回筛选结果的第几个 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param index: 第几个查询结果,1开始 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 本元素后面的某个元素或节点 + """ + return super().after(locator, index, timeout, ele_only=ele_only) + + def 
children(self, locator='', timeout=0, ele_only=True): + """返回当前元素符合条件的直接子元素或节点组成的列表,可用查询语法筛选 + :param locator: 用于筛选的查询语法 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 直接子元素或节点文本组成的列表 + """ + return SessionElementsList(self.owner, super().children(locator, timeout, ele_only=ele_only)) + + def prevs(self, locator='', timeout=None, ele_only=True): + """返回当前元素前面符合条件的同级元素或节点组成的列表,可用查询语法筛选 + :param locator: 用于筛选的查询语法 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 同级元素或节点文本组成的列表 + """ + return SessionElementsList(self.owner, super().prevs(locator, timeout, ele_only=ele_only)) + + def nexts(self, locator='', timeout=None, ele_only=True): + """返回当前元素后面符合条件的同级元素或节点组成的列表,可用查询语法筛选 + :param locator: 用于筛选的查询语法 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 同级元素或节点文本组成的列表 + """ + return SessionElementsList(self.owner, super().nexts(locator, timeout, ele_only=ele_only)) + + def befores(self, locator='', timeout=None, ele_only=True): + """返回文档中当前元素前面符合条件的元素或节点组成的列表,可用查询语法筛选 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 本元素前面的元素或节点组成的列表 + """ + return SessionElementsList(self.owner, super().befores(locator, timeout, ele_only=ele_only)) + + def afters(self, locator='', timeout=None, ele_only=True): + """返回文档中当前元素后面符合条件的元素或节点组成的列表,可用查询语法筛选 + 查找范围不限同级元素,而是整个DOM文档 + :param locator: 用于筛选的查询语法 + :param timeout: 此参数不起实际作用 + :param ele_only: 是否只获取元素,为False时把文本、注释节点也纳入 + :return: 本元素后面的元素或节点组成的列表 + """ + return SessionElementsList(self.owner, super().afters(locator, timeout, ele_only=ele_only)) + + def attr(self, name): + """返回attribute属性值 + :param name: 属性名 + :return: 属性值文本,没有该属性返回None + """ + # 获取href属性时返回绝对url + if name == 'href': + link = self.inner_ele.get('href') + # 若为链接为None、js或邮件,直接返回 + if not link or link.lower().startswith(('javascript:', 'mailto:')): + return link + else: # 
其它情况直接返回绝对url + return make_absolute_link(link, self.owner.url) if self.owner else link + + elif name == 'src': + return make_absolute_link(self.inner_ele.get('src'), + self.owner.url) if self.owner else self.inner_ele.get('src') + + elif name == 'text': + return self.text + + elif name == 'innerText': + return self.raw_text + + elif name in ('html', 'outerHTML'): + return self.html + + elif name == 'innerHTML': + return self.inner_html + + else: + return self.inner_ele.get(name) + + def ele(self, locator, index=1, timeout=None): + """返回当前元素下级符合条件的一个元素、属性或节点文本 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param index: 第几个元素,从1开始,可传入负数获取倒数第几个 + :param timeout: 不起实际作用 + :return: SessionElement对象或属性、文本 + """ + return self._ele(locator, index=index, method='ele()') + + def eles(self, locator, timeout=None): + """返回当前元素下级所有符合条件的子元素、属性或节点文本 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param timeout: 不起实际作用 + :return: SessionElement对象或属性、文本组成的列表 + """ + return self._ele(locator, index=None) + + def s_ele(self, locator=None, index=1): + """返回当前元素下级符合条件的一个元素、属性或节点文本 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param index: 获取第几个,从1开始,可传入负数获取倒数第几个 + :return: SessionElement对象或属性、文本 + """ + return self._ele(locator, index=index, method='s_ele()') + + def s_eles(self, locator): + """返回当前元素下级所有符合条件的子元素、属性或节点文本 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :return: SessionElement对象或属性、文本组成的列表 + """ + return self._ele(locator, index=None) + + def _find_elements(self, locator, timeout=None, index=1, relative=False, raise_err=None): + """返回当前元素下级符合条件的子元素、属性或节点文本 + :param locator: 元素的定位信息,可以是loc元组,或查询字符串 + :param timeout: 不起实际作用,用于和父类对应 + :param index: 第几个结果,从1开始,可传入负数获取倒数第几个,为None返回所有 + :param relative: MixTab用的表示是否相对定位的参数 + :param raise_err: 找不到元素是是否抛出异常,为None时根据全局设置 + :return: SessionElement对象 + """ + return make_session_ele(self, locator, index=index) + + def _get_ele_path(self, mode): + """获取css路径或xpath路径 + :param mode: 'css' 或 'xpath' + :return: css路径或xpath路径 + """ + path_str = '' 
+ ele = self + + while ele: + if mode == 'css': + id_ = ele.attr('id') + if id_: + path_str = f'>{ele.tag}#{id_}{path_str}' + break + brothers = len(ele.eles(f'xpath:./preceding-sibling::*')) + path_str = f'>{ele.tag}:nth-child({brothers + 1}){path_str}' + else: + brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}')) + path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}' + + ele = ele.parent() + + return f'{path_str[1:]}' if mode == 'css' else path_str + + +def make_session_ele(html_or_ele, loc=None, index=1, method=None): + """从接收到的对象或html文本中查找元素,返回SessionElement对象 + 如要直接从html生成SessionElement而不在下级查找,loc输入None即可 + :param html_or_ele: html文本、BaseParser对象 + :param loc: 定位元组或字符串,为None时不在下级查找,返回根元素 + :param index: 获取第几个元素,从1开始,可传入负数获取倒数第几个,None获取所有 + :param method: 调用此方法的方法 + :return: 返回SessionElement元素或列表,或属性文本 + """ + # ---------------处理定位符--------------- + if not loc: + if isinstance(html_or_ele, SessionElement): + return html_or_ele + loc = ('xpath', '.') + + elif isinstance(loc, (str, tuple)): + loc = get_loc(loc) + + else: + raise ValueError("定位符必须为str或长度为2的tuple。") + + # ---------------根据传入对象类型获取页面对象和lxml元素对象--------------- + # 直接传入html文本 + if isinstance(html_or_ele, str): + page = None + html_or_ele = fromstring(html_or_ele) + + # SessionElement + elif html_or_ele._type == 'SessionElement': + page = html_or_ele.owner + + loc_str = loc[1] + if loc[0] == 'xpath' and loc[1].lstrip().startswith('/'): + loc_str = f'.{loc[1]}' + html_or_ele = html_or_ele.inner_ele + + # 若css以>开头,表示找元素的直接子元素,要用page以绝对路径才能找到 + elif loc[0] == 'css selector' and loc[1].lstrip().startswith('>'): + loc_str = f'{html_or_ele.css_path}{loc[1]}' + if html_or_ele.owner: + html_or_ele = fromstring(html_or_ele.owner.html) + else: # 接收html文本,无page的情况 + html_or_ele = fromstring(html_or_ele('xpath:/ancestor::*').html) + + else: + html_or_ele = html_or_ele.inner_ele + + loc = loc[0], loc_str + + elif html_or_ele._type == 'ChromiumElement': + 
loc_str = loc[1] + if loc[0] == 'xpath' and loc[1].lstrip().startswith('/'): + loc_str = f'.{loc[1]}' + elif loc[0] == 'css selector' and loc[1].lstrip().startswith('>'): + loc_str = f'{html_or_ele.css_path}{loc[1]}' + loc = loc[0], loc_str + + # 获取整个页面html再定位到当前元素,以实现查找上级元素 + page = html_or_ele.owner + xpath = html_or_ele.xpath + # ChromiumElement,兼容传入的元素在iframe内的情况 + if html_or_ele._doc_id is None: + doc = html_or_ele._run_js('return this.ownerDocument;') + html_or_ele._doc_id = doc['objectId'] if doc else False + + if html_or_ele._doc_id: + html = html_or_ele.owner._run_cdp('DOM.getOuterHTML', objectId=html_or_ele._doc_id)['outerHTML'] + else: + html = html_or_ele.owner.html + html_or_ele = fromstring(html) + html_or_ele = html_or_ele.xpath(xpath)[0] + + elif html_or_ele._type == 'ChromiumFrame': + page = html_or_ele + html_or_ele = fromstring(html_or_ele.inner_html) + + # 各种页面对象 + elif isinstance(html_or_ele, BasePage): + page = html_or_ele + html = html_or_ele.html + if html.startswith('', '', html) + html_or_ele = fromstring(html) + + # ShadowRoot + elif isinstance(html_or_ele, BaseElement): + page = html_or_ele.owner + html = html_or_ele.html + r = search(r'^[ \n]*?[ \n]*?(.*?)[ \n]*?[ \n]*?$', html) + if r: + html = r.group(1) + html_or_ele = fromstring(html) + + else: + raise TypeError('html_or_ele参数只能是元素、页面对象或html文本。') + + # ---------------执行查找----------------- + try: + if loc[0] == 'xpath': # 用lxml内置方法获取lxml的元素对象列表 + eles = html_or_ele.xpath(loc[1]) + else: # 用css selector获取元素对象列表 + eles = html_or_ele.cssselect(loc[1]) + + if not isinstance(eles, list): # 结果不是列表,如数字 + return eles + + # 把lxml元素对象包装成SessionElement对象并按需要返回一个或全部 + if index is None: + r = SessionElementsList(page=page) + for e in eles: + if e != '\n': + r.append(SessionElement(e, page) if isinstance(e, HtmlElement) else e) + return r + + else: + eles_count = len(eles) + if eles_count == 0 or abs(index) > eles_count: + return NoneElement(page, method=method, args={'locator': loc, 'index': 
index}) + if index < 0: + index = eles_count + index + 1 + + ele = eles[index - 1] + if isinstance(ele, HtmlElement): + return SessionElement(ele, page) + elif isinstance(ele, str): + return ele + else: + return NoneElement(page, method=method, args={'locator': loc, 'index': index}) + + except Exception as e: + if 'Invalid expression' in str(e): + raise SyntaxError(f'无效的xpath语句:{loc}') + elif 'Expected selector' in str(e): + raise SyntaxError(f'无效的css select语句:{loc}') + + raise e diff --git a/src/flaresolverr/DrissionPage/_elements/session_element.pyi b/src/flaresolverr/DrissionPage/_elements/session_element.pyi new file mode 100644 index 0000000000..185cfae029 --- /dev/null +++ b/src/flaresolverr/DrissionPage/_elements/session_element.pyi @@ -0,0 +1,147 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. +""" +from typing import Union, List, Tuple, Optional + +from lxml.html import HtmlElement + +from .._base.base import DrissionElement, BaseElement +from .._elements.chromium_element import ChromiumElement +from .._functions.elements import SessionElementsList +from .._pages.chromium_base import ChromiumBase +from .._pages.chromium_frame import ChromiumFrame +from .._pages.session_page import SessionPage + + +class SessionElement(DrissionElement): + + def __init__(self, ele: HtmlElement, owner: Union[SessionPage, None] = None): + self._inner_ele: HtmlElement = ... + self.owner: SessionPage = ... + self.page: SessionPage = ... + + @property + def inner_ele(self) -> HtmlElement: ... + + def __repr__(self) -> str: ... + + def __call__(self, + locator: Union[Tuple[str, str], str], + index: int = 1, + timeout: float = None) -> SessionElement: ... + + def __eq__(self, other: SessionElement) -> bool: ... + + @property + def tag(self) -> str: ... + + @property + def html(self) -> str: ... + + @property + def inner_html(self) -> str: ... 
+ + @property + def attrs(self) -> dict: ... + + @property + def text(self) -> str: ... + + @property + def raw_text(self) -> str: ... + + def parent(self, + level_or_loc: Union[tuple, str, int] = 1, + index: int = 1) -> SessionElement: ... + + def child(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[SessionElement, str]: ... + + def prev(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[SessionElement, str]: ... + + def next(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[SessionElement, str]: ... + + def before(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[SessionElement, str]: ... + + def after(self, + locator: Union[Tuple[str, str], str, int] = '', + index: int = 1, + timeout: float = None, + ele_only: bool = True) -> Union[SessionElement, str]: ... + + def children(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[SessionElementsList, List[Union[SessionElement, str]]]: ... + + def prevs(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[SessionElementsList, List[Union[SessionElement, str]]]: ... + + def nexts(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[SessionElementsList, List[Union[SessionElement, str]]]: ... + + def befores(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[SessionElementsList, List[Union[SessionElement, str]]]: ... 
+ + def afters(self, + locator: Union[Tuple[str, str], str] = '', + timeout: float = None, + ele_only: bool = True) -> Union[SessionElementsList, List[Union[SessionElement, str]]]: ... + + def attr(self, name: str) -> Optional[str]: ... + + def ele(self, + locator: Union[Tuple[str, str], str], + index: int = 1, + timeout: float = None) -> SessionElement: ... + + def eles(self, + locator: Union[Tuple[str, str], str], + timeout: float = None) -> SessionElementsList: ... + + def s_ele(self, + locator: Union[Tuple[str, str], str] = None, + index: int = 1) -> SessionElement: ... + + def s_eles(self, locator: Union[Tuple[str, str], str]) -> SessionElementsList: ... + + def _find_elements(self, + locator: Union[Tuple[str, str], str], + timeout: float = None, + index: Optional[int] = 1, + relative: bool = False, + raise_err: bool = None) -> Union[SessionElement, SessionElementsList]: ... + + def _get_ele_path(self, mode: str) -> str: ... + + +def make_session_ele(html_or_ele: Union[str, SessionElement, SessionPage, ChromiumElement, BaseElement, ChromiumFrame, +ChromiumBase], + loc: Union[str, Tuple[str, str]] = None, + index: Optional[int] = 1, + method: Optional[str] = None) -> Union[SessionElement, SessionElementsList]: ... diff --git a/src/flaresolverr/DrissionPage/_functions/browser.py b/src/flaresolverr/DrissionPage/_functions/browser.py new file mode 100644 index 0000000000..83048ff84e --- /dev/null +++ b/src/flaresolverr/DrissionPage/_functions/browser.py @@ -0,0 +1,339 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. 
+""" +from json import load, dump, JSONDecodeError +from os import environ +from pathlib import Path +from shutil import rmtree +from subprocess import Popen, DEVNULL +from tempfile import gettempdir +from time import perf_counter, sleep + +from requests import Session + +from .tools import port_is_using +from .._configs.options_manage import OptionsManager +from ..errors import BrowserConnectError + + +def connect_browser(option): + """连接或启动浏览器 + :param option: ChromiumOptions对象 + :return: 返回是否接管的浏览器 + """ + address = option.address.replace('localhost', '127.0.0.1').lstrip('http://').lstrip('https://') + browser_path = option.browser_path + + ip, port = address.split(':') + using = port_is_using(ip, port) + if ip != '127.0.0.1' or using or option.is_existing_only: + if test_connect(ip, port): + return True + elif ip != '127.0.0.1': + raise BrowserConnectError(f'\n{address}浏览器连接失败。') + elif using: + raise BrowserConnectError(f'\n{address}浏览器连接失败,请检查{port}端口是否浏览器,' + f'且已添加\'--remote-debugging-port={port}\'启动项。') + else: # option.is_existing_only + raise BrowserConnectError(f'\n{address}浏览器连接失败,请确认浏览器已启动。') + + # ----------创建浏览器进程---------- + args, user_path = get_launch_args(option) + if option._new_env: + rmtree(user_path, ignore_errors=True) + set_prefs(option) + set_flags(option) + try: + _run_browser(port, browser_path, args) + + # 传入的路径找不到,主动在ini文件、注册表、系统变量中找 + except FileNotFoundError: + browser_path = get_chrome_path(option.ini_path) + if not browser_path: + raise FileNotFoundError('无法找到浏览器可执行文件路径,请手动配置。') + _run_browser(port, browser_path, args) + + if not test_connect(ip, port): + raise BrowserConnectError(f'\n{address}浏览器连接失败。\n请确认:\n' + f'1、用户文件夹没有和已打开的浏览器冲突\n' + f'2、如为无界面系统,请添加\'--headless=new\'启动参数\n' + f'3、如果是Linux系统,尝试添加\'--no-sandbox\'启动参数\n' + f'可使用ChromiumOptions设置端口和用户文件夹路径。') + return False + + +def get_launch_args(opt): + """从ChromiumOptions获取命令行启动参数 + :param opt: ChromiumOptions + :return: 启动参数列表 + """ + # ----------处理arguments----------- + 
result = set() + user_path = False + for i in opt.arguments: + if i.startswith(('--load-extension=', '--remote-debugging-port=')): + continue + elif i.startswith('--user-data-dir') and not opt.system_user_path: + user_path = f'--user-data-dir={Path(i[16:]).absolute()}' + result.add(user_path) + continue + elif i.startswith('--user-agent='): + opt._ua_set = True + result.add(i) + + if not user_path and not opt.system_user_path: + port = opt.address.split(':')[-1] if opt.address else '0' + p = Path(opt.tmp_path) if opt.tmp_path else Path(gettempdir()) / 'DrissionPage' + path = p / 'userData' / port + path.mkdir(parents=True, exist_ok=True) + user_path = path.absolute() + opt.set_user_data_path(user_path) + result.add(f'--user-data-dir={user_path}') + + result = list(result) + + # ----------处理插件extensions------------- + ext = [str(Path(e).absolute()) for e in opt.extensions] + if ext: + ext = ','.join(set(ext)) + ext = f'--load-extension={ext}' + result.append(ext) + + return result, user_path + + +def set_prefs(opt): + """处理启动配置中的prefs项,目前只能对已存在文件夹配置 + :param opt: ChromiumOptions + :return: None + """ + if not opt.user_data_path or (not opt.preferences and not opt._prefs_to_del): + return + prefs = opt.preferences + del_list = opt._prefs_to_del + + user = 'Default' + for arg in opt.arguments: + if arg.startswith('--profile-directory'): + user = arg.split('=')[-1].strip() + break + + prefs_file = Path(opt.user_data_path) / user / 'Preferences' + + if not prefs_file.exists(): + prefs_file.parent.mkdir(parents=True, exist_ok=True) + with open(prefs_file, 'w') as f: + f.write('{}') + + with open(prefs_file, "r", encoding='utf-8') as f: + try: + prefs_dict = load(f) + except JSONDecodeError: + prefs_dict = {} + + for pref in prefs: + value = prefs[pref] + pref = pref.split('.') + _make_leave_in_dict(prefs_dict, pref, 0, len(pref)) + _set_value_to_dict(prefs_dict, pref, value) + + for pref in del_list: + _remove_arg_from_dict(prefs_dict, pref) + + with open(prefs_file, 
'w', encoding='utf-8') as f: + dump(prefs_dict, f) + + +def set_flags(opt): + """处理启动配置中的flags项 + :param opt: ChromiumOptions + :return: None + """ + if not opt.user_data_path or (not opt.clear_file_flags and not opt.flags): + return + + state_file = Path(opt.user_data_path) / 'Local State' + + if not state_file.exists(): + state_file.parent.mkdir(parents=True, exist_ok=True) + with open(state_file, 'w') as f: + f.write('{}') + + with open(state_file, "r", encoding='utf-8') as f: + try: + states_dict = load(f) + except JSONDecodeError: + states_dict = {} + states_dict.setdefault('browser', {}).setdefault('enabled_labs_experiments', []) + flags_list = [] if opt.clear_file_flags else states_dict['browser']['enabled_labs_experiments'] + flags_dict = {} + for i in flags_list: + f = str(i).split('@', 1) + flags_dict[f[0]] = None if len(f) == 1 else f[1] + + for k, i in opt.flags.items(): + flags_dict[k] = i + + states_dict['browser']['enabled_labs_experiments'] = [f'{k}@{i}' if i else k for k, i in flags_dict.items()] + + with open(state_file, 'w', encoding='utf-8') as f: + dump(states_dict, f) + + +def test_connect(ip, port, timeout=30): + """测试浏览器是否可用 + :param ip: 浏览器ip + :param port: 浏览器端口 + :param timeout: 超时时间(秒) + :return: None + """ + end_time = perf_counter() + timeout + s = Session() + s.trust_env = False + s.keep_alive = False + while perf_counter() < end_time: + try: + r = s.get(f'http://{ip}:{port}/json', timeout=10, headers={'Connection': 'close'}) + for tab in r.json(): + if tab['type'] in ('page', 'webview'): + r.close() + s.close() + return True + r.close() + except Exception: + sleep(.2) + + s.close() + return False + + +def _run_browser(port, path: str, args) -> Popen: + """创建浏览器进程 + :param port: 端口号 + :param path: 浏览器路径 + :param args: 启动参数 + :return: 进程对象 + """ + p = Path(path) + p = str(p / 'chrome') if p.is_dir() else str(path) + arguments = [p, f'--remote-debugging-port={port}'] + arguments.extend(args) + try: + return Popen(arguments, shell=False, 
stdout=DEVNULL, stderr=DEVNULL) + except FileNotFoundError: + raise FileNotFoundError('未找到浏览器,请手动指定浏览器可执行文件路径。') + + +def _make_leave_in_dict(target_dict: dict, src: list, num: int, end: int) -> None: + """把prefs中a.b.c形式的属性转为a['b']['c']形式 + :param target_dict: 要处理的字典 + :param src: 属性层级列表[a, b, c] + :param num: 当前处理第几个 + :param end: src长度 + :return: None + """ + if num == end: + return + if src[num] not in target_dict: + target_dict[src[num]] = {} + num += 1 + _make_leave_in_dict(target_dict[src[num - 1]], src, num, end) + + +def _set_value_to_dict(target_dict: dict, src: list, value) -> None: + """把a.b.c形式的属性的值赋值到a['b']['c']形式的字典中 + :param target_dict: 要处理的字典 + :param src: 属性层级列表[a, b, c] + :param value: 属性值 + :return: None + """ + src = "']['".join(src) + src = f"target_dict['{src}']=value" + exec(src) + + +def _remove_arg_from_dict(target_dict: dict, arg: str) -> None: + """把a.b.c形式的属性从字典中删除 + :param target_dict: 要处理的字典 + :param arg: 层级属性,形式'a.b.c' + :return: None + """ + args = arg.split('.') + args = [f"['{i}']" for i in args] + src = ''.join(args) + src = f"target_dict{src}" + try: + exec(src) + src = ''.join(args[:-1]) + src = f"target_dict{src}.pop({args[-1][1:-1]})" + exec(src) + except: + pass + + +def get_chrome_path(ini_path): + """从ini文件或系统变量中获取chrome可执行文件的路径""" + # -----------从ini文件中获取-------------- + if ini_path and Path(ini_path).exists(): + path = OptionsManager(ini_path).chromium_options.get('browser_path', None) + if path and Path(path).is_file(): + return str(path) + + # -----------使用which获取----------- + from shutil import which + path = (which('chrome') or which('chromium') or which('google-chrome') or which('google-chrome-stable') + or which('google-chrome-unstable') or which('google-chrome-beta')) + if path: + return path + + # -----------从MAC和Linux默认路径获取----------- + from platform import system + sys = system().lower() + if sys in ('macos', 'darwin'): + p = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome' + return p if 
Path(p).exists() else None + + elif sys == 'linux': + paths = ('/usr/bin/google-chrome', '/opt/google/chrome/google-chrome', + '/user/lib/chromium-browser/chromium-browser') + for p in paths: + if Path(p).exists(): + return p + return None + + elif sys != 'windows': + return None + + # -----------从注册表中获取-------------- + from winreg import OpenKey, EnumValue, CloseKey, HKEY_CURRENT_USER, HKEY_LOCAL_MACHINE, KEY_READ + txt = r'SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe' + try: + key = OpenKey(HKEY_CURRENT_USER, txt, reserved=0, access=KEY_READ) + k = EnumValue(key, 0) + CloseKey(key) + if k[1]: + return k[1] + + except (FileNotFoundError, OSError): + try: + key = OpenKey(HKEY_LOCAL_MACHINE, txt, reserved=0, access=KEY_READ) + k = EnumValue(key, 0) + CloseKey(key) + if k[1]: + return k[1] + + except (FileNotFoundError, OSError): + pass + + # -----------从系统变量中获取-------------- + for path in environ.get('PATH', '').split(';'): + path = Path(path) / 'chrome.exe' + try: + if path.exists(): + return str(path) + except OSError: + pass diff --git a/src/flaresolverr/DrissionPage/_functions/browser.pyi b/src/flaresolverr/DrissionPage/_functions/browser.pyi new file mode 100644 index 0000000000..dcb46081b5 --- /dev/null +++ b/src/flaresolverr/DrissionPage/_functions/browser.pyi @@ -0,0 +1,28 @@ +# -*- coding:utf-8 -*- +""" +@Author : g1879 +@Contact : g1879@qq.com +@Copyright: (c) 2024 by g1879, Inc. All Rights Reserved. +@License : BSD 3-Clause. +""" +from typing import Union + +from .._configs.chromium_options import ChromiumOptions + + +def connect_browser(option: ChromiumOptions) -> bool: ... + + +def get_launch_args(opt: ChromiumOptions) -> list: ... + + +def set_prefs(opt: ChromiumOptions) -> None: ... + + +def set_flags(opt: ChromiumOptions) -> None: ... + + +def test_connect(ip: str, port: Union[int, str], timeout: float = 30) -> bool: ... + + +def get_chrome_path(ini_path: str) -> Union[str, None]: ... 
class By:
    """Names of the element-locator strategies, exposed as class constants."""

    # strategies that match on a single attribute or on the tag
    ID = 'id'
    NAME = 'name'
    CLASS_NAME = 'class name'
    TAG_NAME = 'tag name'

    # strategies that match on link text
    LINK_TEXT = 'link text'
    PARTIAL_LINK_TEXT = 'partial link text'

    # strategies that take a query expression
    XPATH = 'xpath'
    CSS_SELECTOR = 'css selector'
# Command-line entry point. It deliberately has no docstring: click would show
# one as the command's help text, which the command does not have today.
@command()
@option("-p", "--set-browser-path", help="设置浏览器路径")
@option("-u", "--set-user-path", help="设置用户数据路径")
@option("-c", "--configs-to-here", is_flag=True, help="复制默认配置文件到当前路径")
@option("-l", "--launch-browser", default=-1, help="启动浏览器,传入端口号,0表示用配置文件中的值")
def main(set_browser_path, set_user_path, configs_to_here, launch_browser):
    # Each path option is stored through its own set_paths() call.
    if set_browser_path:
        set_paths(browser_path=set_browser_path)

    if set_user_path:
        set_paths(user_data_path=set_user_path)

    if configs_to_here:
        ch()

    # A negative value (the default is -1) means "do not launch a browser".
    if launch_browser < 0:
        return

    # 0 passes None instead of an address; per the option's help text the
    # value from the configuration file is then used.
    address = f'127.0.0.1:{launch_browser}' if launch_browser else None
    ChromiumPage(address)


def set_paths(browser_path=None, user_data_path=None):
    """Shortcut for saving browser-related paths to the configuration.

    :param browser_path: path of the browser executable; None leaves it unchanged
    :param user_data_path: path of the user data folder; None leaves it unchanged
    :return: None
    """
    options = ChromiumOptions()

    pending = ((browser_path, options.set_browser_path),
               (user_data_path, options.set_user_data_path))
    for value, setter in pending:
        if value is not None:
            setter(value)

    options.save()


if __name__ == '__main__':
    main()
def cookie_to_dict(cookie):
    """Convert a single cookie to dict form.

    A ``Cookie`` object is copied from its attributes (without ``rfc2109`` and
    ``_rest``), a dict is returned as is (same object), and a string is split
    into ``key=value`` attributes. A string is split on ``,`` when it contains
    a comma, otherwise on ``;``.

    :param cookie: Cookie object, str or dict
    :return: the cookie as a dict
    :raises TypeError: if ``cookie`` is none of the supported types
    """
    if isinstance(cookie, Cookie):
        cookie_dict = cookie.__dict__.copy()
        cookie_dict.pop('rfc2109', None)
        cookie_dict.pop('_rest', None)
        return cookie_dict

    elif isinstance(cookie, dict):
        cookie_dict = cookie

    elif isinstance(cookie, str):
        known = ('domain', 'path', 'expires', 'max-age', 'HttpOnly', 'secure', 'expiry', 'name', 'value')
        separator = ',' if ',' in cookie else ';'
        cookie_dict = {}
        for attr in cookie.strip().rstrip(';,').split(separator):
            attr_val = attr.strip().split('=', 1)
            key = attr_val[0]
            val = attr_val[1] if len(attr_val) == 2 else ''
            if key in known:
                cookie_dict[key] = val
            else:
                # any unknown attribute is taken to be the cookie's own name=value pair
                cookie_dict['name'] = key
                cookie_dict['value'] = val

        return cookie_dict

    else:
        raise TypeError('cookie参数必须为Cookie、str或dict类型。')

    return cookie_dict


def cookies_to_tuple(cookies):
    """Convert cookies given in any supported form to a tuple of dicts.

    :param cookies: cookies as CookieJar, list, tuple, str, dict or a single Cookie
    :return: tuple of cookie dicts
    :raises TypeError: if ``cookies`` is none of the supported types
    """
    if isinstance(cookies, (list, tuple, CookieJar)):
        cookies = tuple(cookie_to_dict(cookie) for cookie in cookies)

    elif isinstance(cookies, str):
        c_dict = {}
        for attr in cookies.rstrip('; ').split(';'):
            attr_val = attr.strip().split('=', 1)
            if not attr_val[0]:
                # empty segment ('' or ';;'): it would otherwise become a nameless cookie
                continue
            c_dict[attr_val[0]] = attr_val[1] if len(attr_val) == 2 else True
        cookies = _dict_cookies_to_tuple(c_dict)

    elif isinstance(cookies, dict):
        cookies = _dict_cookies_to_tuple(cookies)

    elif isinstance(cookies, Cookie):
        cookies = (cookie_to_dict(cookies),)

    else:
        raise TypeError('cookies参数必须为Cookie、CookieJar、list、tuple、str或dict类型。')

    return cookies


def set_session_cookies(session, cookies):
    """Set cookies on a Session object.

    Only the cookie fields listed below are forwarded as keyword arguments to
    ``session.cookies.set``. They are matched case-insensitively and passed
    on in lower case.

    :param session: Session object
    :param cookies: cookies in any form accepted by ``cookies_to_tuple``
    :return: None
    """
    allowed = ('version', 'port', 'domain', 'path', 'secure',
               'expires', 'discard', 'comment', 'comment_url', 'rest')
    for cookie in cookies_to_tuple(cookies):
        if cookie['value'] is None:
            cookie['value'] = ''

        # keys are lower-cased so that e.g. 'Domain' is forwarded as 'domain'
        kwargs = {key.lower(): cookie[key] for key in cookie if key.lower() in allowed}

        if 'expiry' in cookie:
            kwargs['expires'] = cookie['expiry']

        session.cookies.set(cookie['name'], cookie['value'], **kwargs)


def set_browser_cookies(browser, cookies):
    """Set cookies on the browser through the 'Storage.setCookies' CDP command.

    :param browser: browser object providing ``_run_cdp``
    :param cookies: cookies in any form accepted by ``cookies_to_tuple``
    :return: None
    :raises ValueError: if a cookie has neither a 'domain' nor a 'url' field
    """
    formatted = []
    for cookie in cookies_to_tuple(cookies):
        if 'domain' not in cookie and 'url' not in cookie:
            raise ValueError(f"cookie必须带有'domain'或'url'字段:{cookie}")
        formatted.append(format_cookie(cookie))
    browser._run_cdp('Storage.setCookies', cookies=formatted)


def set_tab_cookies(page, cookies):
    """Set cookies on a page through the 'Network.setCookie' CDP command.

    :param page: page object
    :param cookies: cookies in any form accepted by ``cookies_to_tuple``
    :return: None
    :raises RuntimeError: if a cookie has no usable domain and the page is not on an http(s) url
    """
    for cookie in cookies_to_tuple(cookies):
        cookie = format_cookie(cookie)

        if cookie['name'].startswith('__Host-'):
            if not page.url.startswith('http'):
                cookie['name'] = cookie['name'].replace('__Host-', '__Secure-', 1)
            else:
                cookie['url'] = page.url
                page._run_cdp_loaded('Network.setCookie', **cookie)
                continue  # no domain needs to be set for this cookie

        if cookie.get('domain', None):
            try:
                page._run_cdp_loaded('Network.setCookie', **cookie)
                if not is_cookie_in_driver(page, cookie):
                    page.browser.set.cookies(cookie)
                continue
            except Exception:
                # best effort: fall through and derive the domain from the page url
                pass

        url = page._browser_url
        if not url.startswith('http'):
            raise RuntimeError(f'未设置域名,请设置cookie的domain参数或先访问一个网站。{cookie}')
        ex_url = extract(url)
        d_list = ex_url.subdomain.split('.')
        d_list.append(f'{ex_url.domain}.{ex_url.suffix}' if ex_url.suffix else ex_url.domain)

        # interleave the labels with '.', e.g. ['www', '.', 'example.com']
        tmp = [d_list[0]]
        for label in d_list[1:]:
            tmp.extend(('.', label))

        # try the candidate domains from the longest to the shortest until one sticks
        for start in range(len(tmp)):
            cookie['domain'] = ''.join(tmp[start:])
            page._run_cdp_loaded('Network.setCookie', **cookie)
            if is_cookie_in_driver(page, cookie):
                break


def is_cookie_in_driver(page, cookie):
    """Check whether a cookie is present in the browser.

    Name and value must match. The domain is compared too when ``cookie``
    contains a 'domain' key.

    :param page: page object providing ``cookies(all_domains=True)``
    :param cookie: cookie in dict form
    :return: True if a matching cookie was found, otherwise False
    """
    check_domain = 'domain' in cookie
    for current in page.cookies(all_domains=True):
        if cookie['name'] != current['name'] or cookie['value'] != current['value']:
            continue
        if not check_domain or cookie['domain'] == current.get('domain', None):
            return True
    return False


def format_cookie(cookie):
    """Normalise a cookie dict in place.

    - 'expiry' is converted to an int and stored as 'expires'
    - an empty 'expires' is removed; a numeric string becomes int or float;
      a date string in ``'%a, %d %b %Y %H:%M:%S GMT'`` form (or the same with
      a two-digit year) becomes a UTC timestamp
    - 'value' is turned into a str (None becomes '')
    - '__Host-' cookies get path '/' and secure=True, '__Secure-' cookies get
      secure=True

    :param cookie: cookie in dict form, must contain 'name' and 'value'
    :return: the same dict, modified
    :raises ValueError: if 'expires' is a string in none of the supported formats
    """
    if 'expiry' in cookie:
        cookie['expires'] = int(cookie['expiry'])
        cookie.pop('expiry')

    if 'expires' in cookie:
        expires = cookie['expires']
        if not expires:
            cookie.pop('expires')

        elif isinstance(expires, str):
            if expires.isdigit():
                cookie['expires'] = int(expires)

            elif expires.replace('.', '', 1).isdigit():
                # only one decimal point is allowed, so '1.2.3' is not sent to float()
                cookie['expires'] = float(expires)

            else:
                from datetime import timezone
                try:
                    moment = datetime.strptime(expires, '%a, %d %b %Y %H:%M:%S GMT')
                except ValueError:
                    moment = datetime.strptime(expires, '%a, %d %b %y %H:%M:%S GMT')
                # the string is in GMT; a naive datetime would be read as local time
                cookie['expires'] = moment.replace(tzinfo=timezone.utc).timestamp()

    if cookie['value'] is None:
        cookie['value'] = ''
    elif not isinstance(cookie['value'], str):
        cookie['value'] = str(cookie['value'])

    if cookie['name'].startswith('__Host-'):
        cookie['path'] = '/'
        cookie['secure'] = True

    elif cookie['name'].startswith('__Secure-'):
        cookie['secure'] = True

    return cookie


class CookiesList(list):
    """A list of cookie dicts with helpers for name/value-only views."""

    def as_dict(self):
        """Return the cookies as a ``{name: value}`` dict."""
        return {c['name']: c['value'] for c in self}

    def as_str(self):
        """Return the cookies as a ``'name=value; name=value'`` string."""
        return '; '.join([f'{c["name"]}={c["value"]}' for c in self])


def _dict_cookies_to_tuple(cookies: dict):
    """Convert cookies given as a dict to a tuple of cookie dicts.

    A dict containing both 'name' and 'value' is treated as one cookie.
    Otherwise every key that is not a known attribute becomes its own cookie,
    and the known attributes present in the dict are copied onto each of them.

    :param cookies: one cookie (with 'name' and 'value') or a mapping of several
    :return: tuple of cookie dicts
    """
    if 'name' in cookies and 'value' in cookies:  # a single cookie
        return (cookies,)
    keys = ('domain', 'path', 'expires', 'max-age', 'HttpOnly', 'secure', 'expiry')
    template = {k: v for k, v in cookies.items() if k in keys}
    return tuple({'name': k, 'value': v, **template} for k, v in cookies.items() if k not in keys)
class SessionElementsList(list):
    """A list of elements that remembers the page it belongs to."""

    def __init__(self, page=None, *args):
        """
        :param page: page object the elements belong to
        :param args: positional arguments passed on to ``list``
        """
        super().__init__(*args)
        self._page = page

    @property
    def get(self):
        """Return a ``Getter`` built on this list."""
        return Getter(self)

    @property
    def filter(self):
        """Return a ``SessionFilter`` built on this list."""
        return SessionFilter(self)

    @property
    def filter_one(self):
        """Return a ``SessionFilterOne`` built on this list."""
        return SessionFilterOne(self)


class ChromiumElementsList(SessionElementsList):
    """Element list that uses the Chromium filter classes and adds state searches."""

    @property
    def filter(self):
        """Return a ``ChromiumFilter`` built on this list."""
        return ChromiumFilter(self)

    @property
    def filter_one(self):
        """Return a ``ChromiumFilterOne`` built on this list."""
        return ChromiumFilterOne(self)

    def search(self, displayed=None, checked=None, selected=None, enabled=None, clickable=None,
               have_rect=None, have_text=None):
        """Filter the elements by state; the work is delegated to ``_search``.

        According to the original documentation the conditions are combined
        with OR.

        :param displayed: whether the element is displayed, bool, None ignores this condition
        :param checked: whether the element is checked, bool, None ignores this condition
        :param selected: whether the element is selected, bool, None ignores this condition
        :param enabled: whether the element is enabled, bool, None ignores this condition
        :param clickable: whether the element is clickable, bool, None ignores this condition
        :param have_rect: whether the element has size and position, bool, None ignores this condition
        :param have_text: whether the element contains text, bool, None ignores this condition
        :return: the filtered result
        """
        return _search(self, displayed=displayed, checked=checked, selected=selected, enabled=enabled,
                       clickable=clickable, have_rect=have_rect, have_text=have_text)

    def search_one(self, index=1, displayed=None, checked=None, selected=None, enabled=None, clickable=None,
                   have_rect=None, have_text=None):
        """Filter the elements by state and pick one; the work is delegated to ``_search_one``.

        According to the original documentation the conditions are combined
        with OR.

        :param index: number of the element to return, starting at 1
        :param displayed: whether the element is displayed, bool, None ignores this condition
        :param checked: whether the element is checked, bool, None ignores this condition
        :param selected: whether the element is selected, bool, None ignores this condition
        :param enabled: whether the element is enabled, bool, None ignores this condition
        :param clickable: whether the element is clickable, bool, None ignores this condition
        :param have_rect: whether the element has size and position, bool, None ignores this condition
        :param have_text: whether the element contains text, bool, None ignores this condition
        :return: the filtered result
        """
        return _search_one(self, index=index, displayed=displayed, checked=checked, selected=selected,
                           enabled=enabled, clickable=clickable, have_rect=have_rect, have_text=have_text)


class SessionFilterOne(object):
    """Picks a single element out of a list by a condition."""

    def __init__(self, _list):
        """
        :param _list: list of elements (or strings) to pick from
        """
        self._list = _list
        self._index = 1

    def __call__(self, index=1):
        """Choose which match the following filter call should return.

        :param index: number of the match, starting at 1
        :return: the object itself
        """
        self._index = index
        return self

    def attr(self, name, value, equal=True):
        """Pick an element by the value of one of its attributes.

        :param name: attribute name
        :param value: attribute value
        :param equal: True matches elements whose attribute equals ``value``, False those where it differs
        :return: the matching element, or a NoneElement if there is none
        """
        return self._get_attr(name, value, 'attr', equal=equal)

    def text(self, text, fuzzy=True, contain=True):
        """Pick an element by whether it contains the given text.

        Strings in the list are compared directly; for other items their
        ``raw_text`` is used.

        :param text: text to match
        :param fuzzy: True for a substring match, False for an exact match
        :param contain: True picks items that match, False picks items that do not
        :return: the matching item, or a NoneElement if there is none
        """
        wanted = bool(contain)
        hits = 0
        for item in self._list:
            content = item if isinstance(item, str) else item.raw_text
            matched = (text in content) if fuzzy else (text == content)
            if matched == wanted:
                hits += 1
                if self._index == hits:
                    return item
        return NoneElement(self._list._page, 'text()',
                           args={'text': text, 'fuzzy': fuzzy, 'contain': contain, 'index': self._index})

    def _get_attr(self, name, value, method, equal=True):
        """Pick an element by the value one of its methods returns for ``name``.

        Strings in the list are skipped.

        :param name: attribute name passed to the method
        :param value: value to compare the result with
        :param method: name of the element method to call
        :param equal: True picks elements whose result equals ``value``, False those where it differs
        :return: the matching element, or a NoneElement if there is none
        """
        hits = 0
        for item in self._list:
            if isinstance(item, str):
                continue
            result = getattr(item, method)(name)
            if (result == value) if equal else (result != value):
                hits += 1
                if self._index == hits:
                    return item
        return NoneElement(self._list._page, f'{method}()',
                           args={'name': name, 'value': value, 'equal': equal, 'index': self._index})


class SessionFilter(SessionFilterOne):
    """Narrows a list down by conditions; every filter call returns the object itself."""

    def __iter__(self):
        return iter(self._list)

    def __next__(self):
        # A list is not an iterator, so next() cannot be called on it directly.
        # The iterator is kept together with the list it was made for, so it
        # starts over whenever a filter call replaces the list.
        state = getattr(self, '_iter_state', None)
        if state is None or state[0] is not self._list:
            state = (self._list, iter(self._list))
            self._iter_state = state
        return next(state[1])

    def __len__(self):
        return len(self._list)

    def __getitem__(self, item):
        return self._list[item]

    @property
    def get(self):
        """Return the object used to read attributes of the elements."""
        return self._list.get

    def text(self, text, fuzzy=True, contain=True):
        """Keep the elements selected by the text condition (delegated to ``_text_all``).

        :param text: text to match
        :param fuzzy: whether to match fuzzily
        :param contain: whether the text must be contained, False means it must not be
        :return: the object itself
        """
        self._list = _text_all(self._list, SessionElementsList(page=self._list._page),
                               text=text, fuzzy=fuzzy, contain=contain)
        return self

    def _get_attr(self, name, value, method, equal=True):
        """Keep the elements selected by the attribute condition (delegated to ``_get_attr_all``).

        :param name: attribute name
        :param value: attribute value
        :param method: name of the element method to call
        :param equal: True keeps elements whose result equals ``value``, False those where it differs
        :return: the object itself
        """
        self._list = _get_attr_all(self._list, SessionElementsList(page=self._list._page),
                                   name=name, value=value, method=method, equal=equal)
        return self