From ce666bcd542962a612c3d0fc9e09d1eb0ae8f00a Mon Sep 17 00:00:00 2001 From: DavHau Date: Sat, 15 Aug 2020 19:09:28 +0700 Subject: [PATCH] init pypi-crawlers (moved from github.com/DavHau/pypi-crawlers) --- pypi-crawlers/.envrc | 1 + pypi-crawlers/.gitignore | 5 + pypi-crawlers/LICENSE | 9 + pypi-crawlers/Readme.md | 49 ++++ pypi-crawlers/debug/Readme.md | 5 + pypi-crawlers/debug/setuptools_call.py | 9 + pypi-crawlers/env.example | 1 + pypi-crawlers/nix/crawler/configuration.nix | 217 ++++++++++++++++ pypi-crawlers/nix/crawler/github_pub_key | 1 + pypi-crawlers/nix/crawler/hetzner.nix | 22 ++ .../nix/crawler/keys.example/crawler_ssh_key | 3 + .../crawler/keys.example/crawler_ssh_key.pub | 1 + .../nix/crawler/keys.example/db_pass | 1 + .../crawler/keys.example/id_ed25519_deps_db | 3 + .../keys.example/id_ed25519_deps_db.pub | 1 + pypi-crawlers/nix/database/configuration.nix | 65 +++++ pypi-crawlers/nix/database/hetzner.nix | 20 ++ pypi-crawlers/nix/nixpkgs-src.nix | 8 + .../nix/power-deps-crawler/configuration.nix | 80 ++++++ .../nix/power-deps-crawler/genesis.nix | 25 ++ .../nix/power-deps-crawler/github_pub_key | 1 + .../power-wheels-crawler/configuration.nix | 140 ++++++++++ .../nix/power-wheels-crawler/genesis.nix | 25 ++ .../nix/power-wheels-crawler/github_pub_key | 1 + .../power-wheels-crawler/hetzner-cloud.nix | 19 ++ pypi-crawlers/nix/python.nix | 16 ++ pypi-crawlers/shell.nix | 18 ++ pypi-crawlers/src/bucket_dict.py | 104 ++++++++ pypi-crawlers/src/crawl_sdist_deps.py | 242 ++++++++++++++++++ pypi-crawlers/src/crawl_urls.py | 113 ++++++++ pypi-crawlers/src/crawl_wheel_deps.py | 214 ++++++++++++++++ pypi-crawlers/src/db.py | 54 ++++ pypi-crawlers/src/dump_sdist_deps.py | 147 +++++++++++ pypi-crawlers/src/extractor/default.nix | 132 ++++++++++ pypi-crawlers/src/extractor/distutils.patch | 55 ++++ pypi-crawlers/src/extractor/extractor.nix | 1 + .../src/extractor/fast-extractor.nix | 1 + pypi-crawlers/src/extractor/setuptools.patch | 13 + pypi-crawlers/src/utils.py | 11 + 39 files changed, 1833 insertions(+) create mode 100644 pypi-crawlers/.envrc create mode 100644 pypi-crawlers/.gitignore create mode 100644 pypi-crawlers/LICENSE create mode 100644 pypi-crawlers/Readme.md create mode 100644 pypi-crawlers/debug/Readme.md create mode 100644 pypi-crawlers/debug/setuptools_call.py create mode 100644 pypi-crawlers/env.example create mode 100644 pypi-crawlers/nix/crawler/configuration.nix create mode 100644 pypi-crawlers/nix/crawler/github_pub_key create mode 100644 pypi-crawlers/nix/crawler/hetzner.nix create mode 100644 pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key create mode 100644 pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key.pub create mode 100644 pypi-crawlers/nix/crawler/keys.example/db_pass create mode 100644 pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db create mode 100644 pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db.pub create mode 100644 pypi-crawlers/nix/database/configuration.nix create mode 100644 pypi-crawlers/nix/database/hetzner.nix create mode 100644 pypi-crawlers/nix/nixpkgs-src.nix create mode 100644 pypi-crawlers/nix/power-deps-crawler/configuration.nix create mode 100644 pypi-crawlers/nix/power-deps-crawler/genesis.nix create mode 100644 pypi-crawlers/nix/power-deps-crawler/github_pub_key create mode 100644 pypi-crawlers/nix/power-wheels-crawler/configuration.nix create mode 100644 pypi-crawlers/nix/power-wheels-crawler/genesis.nix create mode 100644 pypi-crawlers/nix/power-wheels-crawler/github_pub_key create mode 100644 
pypi-crawlers/nix/power-wheels-crawler/hetzner-cloud.nix
 create mode 100644 pypi-crawlers/nix/python.nix
 create mode 100644 pypi-crawlers/shell.nix
 create mode 100644 pypi-crawlers/src/bucket_dict.py
 create mode 100644 pypi-crawlers/src/crawl_sdist_deps.py
 create mode 100644 pypi-crawlers/src/crawl_urls.py
 create mode 100644 pypi-crawlers/src/crawl_wheel_deps.py
 create mode 100644 pypi-crawlers/src/db.py
 create mode 100644 pypi-crawlers/src/dump_sdist_deps.py
 create mode 100644 pypi-crawlers/src/extractor/default.nix
 create mode 100644 pypi-crawlers/src/extractor/distutils.patch
 create mode 100644 pypi-crawlers/src/extractor/extractor.nix
 create mode 100644 pypi-crawlers/src/extractor/fast-extractor.nix
 create mode 100644 pypi-crawlers/src/extractor/setuptools.patch
 create mode 100644 pypi-crawlers/src/utils.py

diff --git a/pypi-crawlers/.envrc b/pypi-crawlers/.envrc
new file mode 100644
index 0000000..1d953f4
--- /dev/null
+++ b/pypi-crawlers/.envrc
@@ -0,0 +1 @@
+use nix
diff --git a/pypi-crawlers/.gitignore b/pypi-crawlers/.gitignore
new file mode 100644
index 0000000..dcb3ae0
--- /dev/null
+++ b/pypi-crawlers/.gitignore
@@ -0,0 +1,5 @@
+.idea/
+**/keys/
+*.pyc
+**/.*/
+env
diff --git a/pypi-crawlers/LICENSE b/pypi-crawlers/LICENSE
new file mode 100644
index 0000000..d8224fb
--- /dev/null
+++ b/pypi-crawlers/LICENSE
@@ -0,0 +1,9 @@
+MIT License
+
+Copyright (c) 2020 David Hauer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/pypi-crawlers/Readme.md b/pypi-crawlers/Readme.md
new file mode 100644
index 0000000..c40d90b
--- /dev/null
+++ b/pypi-crawlers/Readme.md
@@ -0,0 +1,49 @@
+## A set of tools to index Python package URLs and dependencies
+
+These crawlers were created to maintain the data required for [mach-nix](https://github.com/DavHau/mach-nix).
+
+This project contains two crawlers: one indexes the available `sdist` package downloads on pypi.org, the other actually downloads all packages and extracts their dependency information.
+
+The URL index is stored in [nix-pypi-fetcher](https://github.com/DavHau/nix-pypi-fetcher) (which at the same time is a convenient standalone PyPI fetcher for Nix).
+The dependency graph is stored in [pypi-deps-db](https://github.com/DavHau/pypi-deps-db).
+
+---
+## URL Crawler
+It takes the complete list of packages from PyPI's XML-RPC API, then retrieves the download URLs for each package via PyPI's JSON [API](https://warehouse.readthedocs.io/api-reference/json/).
+The sha256 hashes are already returned by this API, so no package needs to be downloaded to build this index.
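For illustration only (this snippet is not part of the patch; the actual implementation is `src/crawl_urls.py` below), a minimal sketch of such a JSON API lookup. The endpoint and the `releases`/`packagetype`/`digests` fields follow the API documentation linked above; the function name and the example package are made up:

```python
import requests

def sdist_releases(package: str):
    """Yield (version, url, sha256) for every sdist file of one package,
    taken from PyPI's JSON API -- no package download required."""
    resp = requests.get(f"https://pypi.org/pypi/{package}/json", timeout=30)
    resp.raise_for_status()
    for version, files in resp.json()["releases"].items():
        for f in files:
            if f["packagetype"] == "sdist":
                yield version, f["url"], f["digests"]["sha256"]

if __name__ == "__main__":
    for version, url, sha256 in sdist_releases("requests"):
        print(version, url, sha256)
```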
+
+---
+## Dependency Crawler
+PyPI does not seem to provide any dependency information via its APIs. A package's dependencies are only revealed during the installation process itself. Therefore, the strategy for extracting a package's dependencies is:
+1. Download and extract the sdist release of the package
+2. Run the package's setup routine through a modified Python environment which doesn't perform a real `setup` but instead just dumps the package's dependencies.
+
+The main modifications made to Python to extract the dependencies are:
+ - Patch the builtin module `distutils` to run the setup routine until all of the important information gathering is finished, then jsonify the relevant arguments and dump them to a file.
+ - Patch `setuptools` to skip the installation of setup requirements and directly call the setup method of `distutils`.
+
+The process of extracting the requirements of a single package is defined as a Nix derivation under `./src/extractor/`.
+This allows running the extraction as a Nix builder in a sandboxed environment.
+The extractor derivation takes one Python package's download information as input and produces a JSON output containing the dependencies.
+A Python-based service regularly checks for new packages detected by the URL crawler and runs them through the `extractor` builder to update the dependency DB. Afterwards, this database is dumped to JSON and published at [pypi-deps-db](https://github.com/DavHau/pypi-deps-db).
+
+---
+### Project Structure
+```
+|- nix/                    Contains NixOps deployments for the crawlers and the database
+|  |- crawler/             Deployment of both crawlers together on a small machine
+|  |- database/            Deployment of the DB needed to store the dependency information
+|  |- power-deps-crawler   Alternative deployment of the dependency crawler on a powerful
+|                          machine, which was needed to process the complete past history
+|                          of python packages.
+|
+|- src/
+   |- extractor/           Nix expression for extracting a single package
+   |- crawl_sdist_deps.py  Entry point for the sdist dependency crawler
+   |- crawl_urls.py        Entry point for the URL crawler
+   |- dump_sdist_deps.py   Entry point for dumping the sdist dependencies from the DB into JSON
+
+```
+
+### Debugging
+see [./debug](./debug)
diff --git a/pypi-crawlers/debug/Readme.md b/pypi-crawlers/debug/Readme.md
new file mode 100644
index 0000000..b3c82d5
--- /dev/null
+++ b/pypi-crawlers/debug/Readme.md
@@ -0,0 +1,5 @@
+### Debugging dependency extraction for a single package
+- Extraction uses a modified Python interpreter which must also be used for debugging. It is available from `./src/extractor/default.nix` as the attributes (py27, py35, py36, ...). Either build it, for example via `nix-build ./src/extractor -A py37`, or use `nix-shell`.
+- Find the sdist release of the package you want to debug on pypi.org, then download and unpack it.
+- Copy `./debug/setuptools_call.py` from this project into the unpacked source, next to its `setup.py`, and execute `setuptools_call.py` from there.
+- You can use a debugger and set breakpoints in `setup.py`.
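To make the extraction mechanism described in the Readme above concrete, here is a simplified, hypothetical sketch of the dumping idea behind `src/extractor/distutils.patch`: let `setup()` gather its arguments, then write the requirement-related ones to a JSON file instead of installing anything. The environment variables `dump_setup_attrs` and `out_file` are the ones set by the extractor derivation in `src/extractor/default.nix`; everything else is condensed and is not the literal patch (which additionally normalizes bytes/unicode values before dumping).

```python
import json
import os

REQUIREMENT_KEYS = (
    "install_requires", "setup_requires", "extras_require",
    "tests_require", "python_requires",
)

def dump_setup_attrs(attrs: dict) -> None:
    """Write the requirement-related setup() arguments to $out_file as JSON."""
    data = {key: attrs[key] for key in REQUIREMENT_KEYS if key in attrs}
    with open(os.environ["out_file"], "w") as f:
        json.dump(data, f, indent=2)

def setup(**attrs):
    """Simplified stand-in for the patched distutils.core.setup()."""
    if os.environ.get("dump_setup_attrs"):
        dump_setup_attrs(attrs)
        return  # skip the real build/installation entirely
    # ...otherwise the original setup logic would run here
```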
diff --git a/pypi-crawlers/debug/setuptools_call.py b/pypi-crawlers/debug/setuptools_call.py new file mode 100644 index 0000000..e78623d --- /dev/null +++ b/pypi-crawlers/debug/setuptools_call.py @@ -0,0 +1,9 @@ +import sys, setuptools, tokenize + +sys.argv[0] = 'setup.py' +sys.argv[1] = 'install' +__file__='setup.py' +f=getattr(tokenize, 'open', open)(__file__) +code=f.read().replace('\r\n', '\n') +f.close() +exec(compile(code, __file__, 'exec')) \ No newline at end of file diff --git a/pypi-crawlers/env.example b/pypi-crawlers/env.example new file mode 100644 index 0000000..12ba3fb --- /dev/null +++ b/pypi-crawlers/env.example @@ -0,0 +1 @@ +export EMAIL= \ No newline at end of file diff --git a/pypi-crawlers/nix/crawler/configuration.nix b/pypi-crawlers/nix/crawler/configuration.nix new file mode 100644 index 0000000..3d479d0 --- /dev/null +++ b/pypi-crawlers/nix/crawler/configuration.nix @@ -0,0 +1,217 @@ +{ config, pkgs, ...}: +let + python = (import ../python.nix); + user = "crawler"; + src = "${../../src}"; + nixpkgs_src = (import ../../nix/nixpkgs-src.nix).stable; + db_host = "10.147.19.69"; + extractor = import ../../src/extractor; + branch = "master"; + enable = true; + serviceConfig = { + Type = "simple"; + User = "${user}"; + RuntimeMaxSec = 60 * 60 * 10; # 10h + }; + cd_into_updated_proj_branch = name: dir: branch: email: '' + if [ ! -e /home/${user}/${dir} ]; then + git clone git@github.com:DavHau/${name}.git /home/${user}/${dir} + cd /home/${user}/${dir} + git config user.email "${email}" + git config user.name "DavHau-bot" + fi + cd /home/${user}/${dir} + git fetch --all + git checkout ${branch} + git pull + ''; +in +{ + deployment.keys = { + db_pass = { + keyFile = ./keys/db_pass; + destDir = "/home/${user}/"; + user = "${user}"; + }; + id_ed25519 = { + keyFile = ./keys/crawler_ssh_key; + destDir = "/home/${user}/.ssh/"; + user = "${user}"; + }; + id_ed25519_deps_db = { + keyFile = ./keys/id_ed25519_deps_db; + destDir = "/home/${user}/.ssh/"; + user = "${user}"; + }; + }; + swapDevices = [{ + size = 10000; + device = "/tmp/swapfile"; + }]; + nix.nixPath = [ "nixpkgs=${nixpkgs_src}" ]; + services.journald.extraConfig = '' + SystemMaxUse=1G + ''; + nixpkgs.config.allowUnfree = true; + environment.systemPackages = [ + python + pkgs.htop + pkgs.vim + pkgs.bmon + extractor.py27 + extractor.py35 + extractor.py36 + extractor.py37 + extractor.py38 + ]; + nix.maxJobs = 2; + nix.extraOptions = '' + http-connections = 300 + #keep-env-derivations = true + keep-outputs = true + ''; + services.zerotierone.enable = true; + services.zerotierone.joinNetworks = ["93afae59636cb8e3"]; # db network + users = { + mutableUsers = false; + users."${user}" = { + home = "/home/${user}"; + createHome = true; + }; + }; + programs.ssh.knownHosts = { + github = { + hostNames = [ "github.com" "13.229.188.59" ]; + publicKeyFile = "${./github_pub_key}"; + }; + }; + system.activationScripts = { + ssh_dir = { + text = '' + chown -R crawler /home/crawler/.ssh + ''; + deps = []; + }; + }; + systemd.services.crawl-urls = + let + environment = { + WORKERS = "5"; + PYTHONPATH = src; + EMAIL = "hsngrmpf+pypiurlcrawler@gmail.com"; + }; + in + { + inherit serviceConfig environment; + description = "Crawl PyPi URLs"; + after = [ "network-online.target" ]; + path = [ python pkgs.git ]; + script = with environment; '' + set -x + ${cd_into_updated_proj_branch "nix-pypi-fetcher" "nix-pypi-fetcher_update" "${branch}" EMAIL} + rm -f ./pypi/* + ${python}/bin/python -u ${src}/crawl_urls.py ./pypi + echo $(date +%s) > 
UNIX_TIMESTAMP + git add ./pypi UNIX_TIMESTAMP + git pull + git commit -m "$(date)" + git push + ''; + }; + systemd.timers.crawl-urls = { + inherit enable; + wantedBy = [ "timers.target" ]; + partOf = [ "crawl-urls.service" ]; + timerConfig.OnCalendar = "00/12:00"; # at 00:00 and 12:00 + }; + systemd.services.crawl-sdist = + let + environment = { + WORKERS = "5"; + PYTHONPATH = src; + NIX_PATH = "nixpkgs=${nixpkgs_src}"; + DB_HOST = db_host; + EMAIL = "hsngrmpf+pypidepscrawler@gmail.com"; + CLEANUP = "y"; + pypi_fetcher = "/home/${user}/nix-pypi-fetcher"; + }; + in + { + inherit serviceConfig environment; + description = "Crawl PyPi Sdist Deps and push to github"; + after = [ "network-online.target" ]; + path = [ python ] ++ (with pkgs; [ git nix gawk gnutar gzip ]); + script = with environment; '' + export DB_PASS=$(cat /home/${user}/db_pass) + ${python}/bin/python -u ${src}/crawl_sdist_deps.py + set -x + export GIT_SSH_COMMAND="${pkgs.openssh}/bin/ssh -i /home/${user}/.ssh/id_ed25519_deps_db" + ${cd_into_updated_proj_branch "nix-pypi-fetcher" "nix-pypi-fetcher" "${branch}" EMAIL} + ${cd_into_updated_proj_branch "pypi-deps-db" "pypi-deps-db" "${branch}" EMAIL} + rm -f ./sdist/* + ${python}/bin/python -u ${src}/dump_sdist_deps.py ./sdist + echo $(date +%s) > UNIX_TIMESTAMP + pypi_fetcher_commit=$(git ls-remote https://github.com/DavHau/nix-pypi-fetcher ${branch} | awk '{print $1;}') + pypi_fetcher_url="https://github.com/DavHau/nix-pypi-fetcher/archive/''${pypi_fetcher_commit}.tar.gz" + pypi_fetcher_hash=$(nix-prefetch-url --unpack $pypi_fetcher_url) + echo $pypi_fetcher_commit > PYPI_FETCHER_COMMIT + echo $pypi_fetcher_hash > PYPI_FETCHER_SHA256 + git add ./sdist UNIX_TIMESTAMP PYPI_FETCHER_COMMIT PYPI_FETCHER_SHA256 + git pull + git commit -m "$(date) - sdist_update" + git push + ''; + }; + systemd.timers.crawl-sdist = { + inherit enable; + wantedBy = [ "timers.target" ]; + partOf = [ "crawl-deps.service" ]; + timerConfig.OnCalendar = [ + "Mon-Sun *-*-* 4:00:00" + "Mon-Sun *-*-* 16:00:00" + ]; + }; + systemd.services.crawl-wheel = + let + environment = { + WORKERS = "5"; + PYTHONPATH = src; + EMAIL = "hsngrmpf+pypidepscrawler@gmail.com"; + pypi_fetcher = "/home/${user}/nix-pypi-fetcher"; + dump_dir = "/home/${user}/pypi-deps-db/wheel"; + }; + in + { + inherit serviceConfig environment; + description = "Crawl Pypi Wheel Deps and push to gitub"; + after = [ "network-online.target" ]; + path = [ python ] ++ (with pkgs; [ git nix gawk gnutar gzip ]); + script = with environment; '' + set -x + export GIT_SSH_COMMAND="${pkgs.openssh}/bin/ssh -i /home/${user}/.ssh/id_ed25519_deps_db" + ${cd_into_updated_proj_branch "nix-pypi-fetcher" "nix-pypi-fetcher" "${branch}" EMAIL} + ${cd_into_updated_proj_branch "pypi-deps-db" "pypi-deps-db" "${branch}" EMAIL} + export PYTONPATH=${src} + ${python}/bin/python -u ${src}/crawl_wheel_deps.py $dump_dir + echo $(date +%s) > UNIX_TIMESTAMP + pypi_fetcher_commit=$(git ls-remote https://github.com/DavHau/nix-pypi-fetcher ${branch} | awk '{print $1;}') + pypi_fetcher_url="https://github.com/DavHau/nix-pypi-fetcher/archive/''${pypi_fetcher_commit}.tar.gz" + pypi_fetcher_hash=$(nix-prefetch-url --unpack $pypi_fetcher_url) + echo $pypi_fetcher_commit > PYPI_FETCHER_COMMIT + echo $pypi_fetcher_hash > PYPI_FETCHER_SHA256 + git add ./wheel UNIX_TIMESTAMP PYPI_FETCHER_COMMIT PYPI_FETCHER_SHA256 + git pull + git commit -m "$(date) - wheel_update" + git push + ''; + }; + systemd.timers.crawl-wheel = { + inherit enable; + wantedBy = [ "timers.target" ]; + partOf = [ 
"dump-deps.service" ]; + timerConfig.OnCalendar = [ + "Mon-Sun *-*-* 8:00:00" + "Mon-Sun *-*-* 20:00:00" + ]; + }; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/crawler/github_pub_key b/pypi-crawlers/nix/crawler/github_pub_key new file mode 100644 index 0000000..66dd994 --- /dev/null +++ b/pypi-crawlers/nix/crawler/github_pub_key @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ== \ No newline at end of file diff --git a/pypi-crawlers/nix/crawler/hetzner.nix b/pypi-crawlers/nix/crawler/hetzner.nix new file mode 100644 index 0000000..e726424 --- /dev/null +++ b/pypi-crawlers/nix/crawler/hetzner.nix @@ -0,0 +1,22 @@ +{ + network.description = "Pypi Crawler"; + network.enableRollback = true; + + machine = + { config, pkgs, ... }: + { imports = [ + + ./configuration.nix + ]; + boot.loader.grub.device = "/dev/sda"; + fileSystems."/" = { device = "/dev/sda1"; fsType = "ext4"; }; + boot.cleanTmpDir = true; + networking.hostName = "pypi-crawler"; + networking.firewall.allowPing = true; + services.openssh.enable = true; + users.users.root.openssh.authorizedKeys.keys = [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDuhpzDHBPvn8nv8RH1MRomDOaXyP4GziQm7r3MZ1Syk" + ]; + deployment.targetHost = "95.217.166.31"; + }; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key b/pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key new file mode 100644 index 0000000..3b98433 --- /dev/null +++ b/pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key @@ -0,0 +1,3 @@ +-----BEGIN OPENSSH PRIVATE KEY----- +EXAMPLE +-----END OPENSSH PRIVATE KEY----- diff --git a/pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key.pub b/pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key.pub new file mode 100644 index 0000000..08a2e06 --- /dev/null +++ b/pypi-crawlers/nix/crawler/keys.example/crawler_ssh_key.pub @@ -0,0 +1 @@ +ssh-ed25519 ... diff --git a/pypi-crawlers/nix/crawler/keys.example/db_pass b/pypi-crawlers/nix/crawler/keys.example/db_pass new file mode 100644 index 0000000..4111a67 --- /dev/null +++ b/pypi-crawlers/nix/crawler/keys.example/db_pass @@ -0,0 +1 @@ +db_password \ No newline at end of file diff --git a/pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db b/pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db new file mode 100644 index 0000000..2617fef --- /dev/null +++ b/pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db @@ -0,0 +1,3 @@ +-----BEGIN OPENSSH PRIVATE KEY----- +Another Key +-----END OPENSSH PRIVATE KEY----- diff --git a/pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db.pub b/pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db.pub new file mode 100644 index 0000000..08a2e06 --- /dev/null +++ b/pypi-crawlers/nix/crawler/keys.example/id_ed25519_deps_db.pub @@ -0,0 +1 @@ +ssh-ed25519 ... diff --git a/pypi-crawlers/nix/database/configuration.nix b/pypi-crawlers/nix/database/configuration.nix new file mode 100644 index 0000000..7fc7295 --- /dev/null +++ b/pypi-crawlers/nix/database/configuration.nix @@ -0,0 +1,65 @@ +with import (import ../nixpkgs-src.nix).stable {}; +{ config, pkgs, nodes, ... 
}: +{ environment.systemPackages = [ + bmon htop + screen + git + lz4 + ]; + deployment.keys = { + initial_script = { + keyFile = ./keys/initial_script; + destDir = "/keys"; + }; + }; + services.postgresql = { + enable = true; + package = pkgs.postgresql_11; + enableTCPIP = true; + authentication = pkgs.lib.mkOverride 10 '' + local all all ident + #host all all ::1/128 md5 + host all all 0.0.0.0/0 password + ''; + ensureDatabases = [ "almighty" "test" ]; + ensureUsers = [ + { + name = "almighty"; + ensurePermissions = { + "DATABASE almighty" = "ALL PRIVILEGES"; + "DATABASE test" = "ALL PRIVILEGES"; + }; + } + { + name = "root"; + ensurePermissions = { + "ALL TABLES IN SCHEMA public" = "ALL PRIVILEGES"; + }; + } + ]; + initialScript = "/keys/initial_script"; + extraConfig = '' + max_connections = 20 + shared_buffers = 768MB + effective_cache_size = 2304MB + maintenance_work_mem = 192MB + checkpoint_completion_target = 0.9 + wal_buffers = 16MB + default_statistics_target = 100 + random_page_cost = 1.1 + effective_io_concurrency = 200 + work_mem = 19660kB + min_wal_size = 1GB + max_wal_size = 4GB + max_worker_processes = 2 + max_parallel_workers_per_gather = 1 + max_parallel_workers = 2 + max_parallel_maintenance_workers = 1 + ''; + }; + nixpkgs.config.allowUnfree = true; + services.zerotierone.enable = true; + services.zerotierone.joinNetworks = ["93afae59636cb8e3"]; + users.users.postgres.hashedPassword = "$6$JdTxB0NOfAXl$oKWTqnPuE67WikhRHNM3r/.fef2NEIZeEybkEJnkL8D0jh65YwsdlwKC86ig6VK1EuA6R4UARFVwCTdTk7npk/"; + networking.firewall.allowedTCPPorts = [ 5432 ]; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/database/hetzner.nix b/pypi-crawlers/nix/database/hetzner.nix new file mode 100644 index 0000000..5fe1fdf --- /dev/null +++ b/pypi-crawlers/nix/database/hetzner.nix @@ -0,0 +1,20 @@ +{ + network.description = "Almighty"; + machine = + { config, pkgs, ... }: + { imports = [ + + ./configuration.nix + ]; + boot.loader.grub.device = "/dev/sda"; + fileSystems."/" = { device = "/dev/sda1"; fsType = "ext4"; }; + boot.cleanTmpDir = true; + networking.hostName = "almighty-db"; + networking.firewall.allowPing = true; + services.openssh.enable = true; + users.users.root.openssh.authorizedKeys.keys = [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDuhpzDHBPvn8nv8RH1MRomDOaXyP4GziQm7r3MZ1Syk" + ]; + deployment.targetHost = "95.216.192.49"; + }; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/nixpkgs-src.nix b/pypi-crawlers/nix/nixpkgs-src.nix new file mode 100644 index 0000000..8b43003 --- /dev/null +++ b/pypi-crawlers/nix/nixpkgs-src.nix @@ -0,0 +1,8 @@ +rec { + stable = builtins.fetchGit { + name = "nixpkgs"; + url = "https://github.com/nixos/nixpkgs-channels/"; + ref = "refs/heads/nixos-20.03"; + rev = "0a40a3999eb4d577418515da842a2622a64880c5"; + }; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/power-deps-crawler/configuration.nix b/pypi-crawlers/nix/power-deps-crawler/configuration.nix new file mode 100644 index 0000000..e3d864f --- /dev/null +++ b/pypi-crawlers/nix/power-deps-crawler/configuration.nix @@ -0,0 +1,80 @@ +{ config, pkgs, nodes, ... 
}: +let + python = (import ../python.nix); + user = "crawler"; + src = "${../../src}"; + nixpkgs_src = "${../nixpkgs-src.nix}"; +in +{ + deployment.keys = { + db_pass = { + keyFile = ./keys/db_pass; + destDir = "/home/${user}/"; + user = "${user}"; + }; + id_ed25519 = { + keyFile = ./keys/crawler_ssh_key; + destDir = "/home/${user}/.ssh/"; + user = "${user}"; + }; + }; + environment.systemPackages = with pkgs; [ + bmon htop + screen + git + vim + lz4 + ]; + nix.maxJobs = 100; + nix.extraOptions = '' + #use-sqlite-wal = false + http-connections = 300 + keep-env-derivations = true + keep-outputs = true + ''; + + fileSystems."/tmp-store" = + { fsType = "tmpfs"; + options = [ "size=50%" ]; + }; + users = { + mutableUsers = false; + users."${user}" = { + home = "/home/${user}"; + createHome = true; + }; + }; + programs.ssh.knownHosts = { + github = { + hostNames = [ "github.com" "13.229.188.59" ]; + publicKeyFile = "${./github_pub_key}"; + }; + }; + systemd.services.crawl-deps = { + description = "Crawl PyPi Deps"; + after = [ "network-online.target" ]; + serviceConfig = { Type = "simple"; }; + serviceConfig = { User = "${user}"; }; + environment = { + WORKERS = "60"; + PYTHONPATH = src; + NIXPKGS_SRC = nixpkgs_src; + almighty_cleanup = "y"; + almighty_store = "/tmp-store"; + almighty_workers = "60"; + }; + path = [ python pkgs.git pkgs.nix pkgs.gnutar]; + script = '' + export DB_PASS=$(cat /home/${user}/db_pass) + ${python}/bin/python -u ${src}/crawl_deps.py + ''; + }; + system.activationScripts = { + ssh_dir = { + text = '' + chown -R crawler /home/crawler/.ssh + ''; + deps = []; + }; + }; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/power-deps-crawler/genesis.nix b/pypi-crawlers/nix/power-deps-crawler/genesis.nix new file mode 100644 index 0000000..b313428 --- /dev/null +++ b/pypi-crawlers/nix/power-deps-crawler/genesis.nix @@ -0,0 +1,25 @@ +{ + machine = + { config, pkgs, nodes, ... }: + { + imports = [ + + ./configuration.nix + ]; + + deployment.targetHost = "194.61.20.239"; + + boot.cleanTmpDir = true; + networking.hostName = "nixos"; + networking.firewall.allowPing = true; + services.openssh.enable = true; + services.openssh.forwardX11 = true; + services.openssh.passwordAuthentication = false; + users.users.root.openssh.authorizedKeys.keys = [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDuhpzDHBPvn8nv8RH1MRomDOaXyP4GziQm7r3MZ1Syk" + ]; + + boot.loader.grub.device = "/dev/vda"; + fileSystems."/" = { device = "/dev/mapper/gc--vg-root"; fsType = "ext4"; }; + }; +} diff --git a/pypi-crawlers/nix/power-deps-crawler/github_pub_key b/pypi-crawlers/nix/power-deps-crawler/github_pub_key new file mode 100644 index 0000000..66dd994 --- /dev/null +++ b/pypi-crawlers/nix/power-deps-crawler/github_pub_key @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ== \ No newline at end of file diff --git a/pypi-crawlers/nix/power-wheels-crawler/configuration.nix b/pypi-crawlers/nix/power-wheels-crawler/configuration.nix new file mode 100644 index 0000000..5cee719 --- /dev/null +++ b/pypi-crawlers/nix/power-wheels-crawler/configuration.nix @@ -0,0 +1,140 @@ +{ config, pkgs, nodes, ... 
}: +let + python = (import ../python.nix); + user = "crawler"; + src = "${../../src}"; +in +{ + swapDevices = [ + { + size = 150000; + device = "/tmp/swapfile"; + } + #{ + # size = 50000; + # device = "/tmp/swapfile2"; + #} + ]; + environment.systemPackages = with pkgs; [ + bmon htop + screen + git + vim + lz4 + ]; + users = { + mutableUsers = false; + users."${user}" = { + home = "/home/${user}"; + createHome = true; + }; + }; + systemd.services.crawl-deps = { + description = "Crawl PyPi Deps for wheels"; + after = [ "network-online.target" ]; + serviceConfig = { Type = "simple"; }; + serviceConfig = { User = "${user}"; }; + environment = { + WORKERS = "100"; + PYTHONPATH = src; + EMAIL = "hsngrmpf+pypidepscrawler@gmail.com"; + pypi_fetcher = "/home/${user}/nix-pypi-fetcher"; + dump_dir = "/home/${user}/wheels"; + #skip = "21"; + }; + path = [ python pkgs.git ]; + script = '' + if [ ! -e /home/${user}/nix-pypi-fetcher ]; then + git clone --single-branch --branch wheels https://github.com/DavHau/nix-pypi-fetcher.git /home/${user}/nix-pypi-fetcher + cd /home/${user}/nix-pypi-fetcher + git config user.email "$EMAIL" + git config user.name "DavHau" + fi + cd /home/${user}/nix-pypi-fetcher + #git checkout wheels + #git pull + ${python}/bin/python -u ${src}/wheel_deps_spider.py + ''; + }; + systemd.services.crawl-deps2 = { + description = "Crawl PyPi Deps for wheels 2"; + after = [ "network-online.target" ]; + serviceConfig = { Type = "simple"; }; + serviceConfig = { User = "${user}"; }; + environment = { + WORKERS = "100"; + PYTHONPATH = src; + EMAIL = "hsngrmpf+pypidepscrawler@gmail.com"; + pypi_fetcher = "/home/${user}/nix-pypi-fetcher"; + dump_dir = "/home/${user}/wheels"; + skip = "40"; + }; + path = [ python pkgs.git ]; + script = '' + if [ ! -e /home/${user}/nix-pypi-fetcher ]; then + git clone --single-branch --branch wheels https://github.com/DavHau/nix-pypi-fetcher.git /home/${user}/nix-pypi-fetcher + cd /home/${user}/nix-pypi-fetcher + git config user.email "$EMAIL" + git config user.name "DavHau" + fi + cd /home/${user}/nix-pypi-fetcher + #git checkout wheels + #git pull + ${python}/bin/python -u ${src}/wheel_deps_spider.py + ''; + }; + systemd.services.crawl-deps3 = { + description = "Crawl PyPi Deps for wheels 3"; + after = [ "network-online.target" ]; + serviceConfig = { Type = "simple"; }; + serviceConfig = { User = "${user}"; }; + environment = { + WORKERS = "100"; + PYTHONPATH = src; + EMAIL = "hsngrmpf+pypidepscrawler@gmail.com"; + pypi_fetcher = "/home/${user}/nix-pypi-fetcher"; + dump_dir = "/home/${user}/wheels"; + skip = "80"; + }; + path = [ python pkgs.git ]; + script = '' + if [ ! -e /home/${user}/nix-pypi-fetcher ]; then + git clone --single-branch --branch wheels https://github.com/DavHau/nix-pypi-fetcher.git /home/${user}/nix-pypi-fetcher + cd /home/${user}/nix-pypi-fetcher + git config user.email "$EMAIL" + git config user.name "DavHau" + fi + cd /home/${user}/nix-pypi-fetcher + #git checkout wheels + #git pull + ${python}/bin/python -u ${src}/wheel_deps_spider.py + ''; + }; + systemd.services.crawl-deps4 = { + description = "Crawl PyPi Deps for wheels 4"; + after = [ "network-online.target" ]; + serviceConfig = { Type = "simple"; }; + serviceConfig = { User = "${user}"; }; + environment = { + WORKERS = "100"; + PYTHONPATH = src; + EMAIL = "hsngrmpf+pypidepscrawler@gmail.com"; + pypi_fetcher = "/home/${user}/nix-pypi-fetcher"; + dump_dir = "/home/${user}/wheels"; + skip = "c0"; + }; + path = [ python pkgs.git ]; + script = '' + if [ ! 
-e /home/${user}/nix-pypi-fetcher ]; then + git clone --single-branch --branch wheels https://github.com/DavHau/nix-pypi-fetcher.git /home/${user}/nix-pypi-fetcher + cd /home/${user}/nix-pypi-fetcher + git config user.email "$EMAIL" + git config user.name "DavHau" + fi + cd /home/${user}/nix-pypi-fetcher + #git checkout wheels + #git pull + ${python}/bin/python -u ${src}/wheel_deps_spider.py + ''; + }; +} \ No newline at end of file diff --git a/pypi-crawlers/nix/power-wheels-crawler/genesis.nix b/pypi-crawlers/nix/power-wheels-crawler/genesis.nix new file mode 100644 index 0000000..b313428 --- /dev/null +++ b/pypi-crawlers/nix/power-wheels-crawler/genesis.nix @@ -0,0 +1,25 @@ +{ + machine = + { config, pkgs, nodes, ... }: + { + imports = [ + + ./configuration.nix + ]; + + deployment.targetHost = "194.61.20.239"; + + boot.cleanTmpDir = true; + networking.hostName = "nixos"; + networking.firewall.allowPing = true; + services.openssh.enable = true; + services.openssh.forwardX11 = true; + services.openssh.passwordAuthentication = false; + users.users.root.openssh.authorizedKeys.keys = [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDuhpzDHBPvn8nv8RH1MRomDOaXyP4GziQm7r3MZ1Syk" + ]; + + boot.loader.grub.device = "/dev/vda"; + fileSystems."/" = { device = "/dev/mapper/gc--vg-root"; fsType = "ext4"; }; + }; +} diff --git a/pypi-crawlers/nix/power-wheels-crawler/github_pub_key b/pypi-crawlers/nix/power-wheels-crawler/github_pub_key new file mode 100644 index 0000000..66dd994 --- /dev/null +++ b/pypi-crawlers/nix/power-wheels-crawler/github_pub_key @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ== \ No newline at end of file diff --git a/pypi-crawlers/nix/power-wheels-crawler/hetzner-cloud.nix b/pypi-crawlers/nix/power-wheels-crawler/hetzner-cloud.nix new file mode 100644 index 0000000..48f586e --- /dev/null +++ b/pypi-crawlers/nix/power-wheels-crawler/hetzner-cloud.nix @@ -0,0 +1,19 @@ +{ + network.enableRollback = true; + machine = + { config, pkgs, ... 
}: + { imports = [ + + ./configuration.nix + ]; + boot.loader.grub.device = "/dev/sda"; + fileSystems."/" = { device = "/dev/sda1"; fsType = "ext4"; }; + boot.cleanTmpDir = true; + networking.firewall.allowPing = true; + services.openssh.enable = true; + users.users.root.openssh.authorizedKeys.keys = [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDuhpzDHBPvn8nv8RH1MRomDOaXyP4GziQm7r3MZ1Syk" + ]; + deployment.targetHost = "78.47.156.99"; + }; +} diff --git a/pypi-crawlers/nix/python.nix b/pypi-crawlers/nix/python.nix new file mode 100644 index 0000000..08ff939 --- /dev/null +++ b/pypi-crawlers/nix/python.nix @@ -0,0 +1,16 @@ +let + mach-nix = import (builtins.fetchGit { + url = "https://github.com/DavHau/mach-nix/"; + ref = "2.1.0"; + }); +in +mach-nix.mkPython { + requirements = '' + packaging + requests + psycopg2 >= 2.8.0 + pkginfo + peewee + bounded-pool-executor + ''; +} \ No newline at end of file diff --git a/pypi-crawlers/shell.nix b/pypi-crawlers/shell.nix new file mode 100644 index 0000000..6a78495 --- /dev/null +++ b/pypi-crawlers/shell.nix @@ -0,0 +1,18 @@ +let + nixpkgs-src = (import ./nix/nixpkgs-src.nix).stable; + pkgs = import nixpkgs-src {}; + env = ./env; +in +pkgs.mkShell { + buildInputs = [ + (import ./nix/python.nix) + pkgs.nixops + pkgs.nix + ]; + shellHook = '' + export NIX_PATH="nixpkgs=${nixpkgs-src}:." + export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.crt + export PYTHONPATH=$(pwd)/src + source ${env} + ''; +} diff --git a/pypi-crawlers/src/bucket_dict.py b/pypi-crawlers/src/bucket_dict.py new file mode 100644 index 0000000..c22955e --- /dev/null +++ b/pypi-crawlers/src/bucket_dict.py @@ -0,0 +1,104 @@ +import json +import os +from _sha256 import sha256 +from collections import UserDict, OrderedDict + + +class LazyBucketDict(UserDict): + + def __init__(self, directory, data=None, restrict_to_bucket: str = None): + super().__init__() + self._restrict_to_bucket = restrict_to_bucket + self.directory = directory + self.data = {} + if data: + for key, val in data.items(): + self.__setitem__(key, val) + + def __getitem__(self, key): + bucket = self._bucket_secure(key) + self.ensure_bucket_loaded(bucket) + return self.data[bucket][key] + + def __setitem__(self, key, val): + bucket = self._bucket_secure(key) + self.ensure_bucket_loaded(bucket) + self.data[bucket][key] = val + + def __contains__(self, key): + bucket = self._bucket_secure(key) + self.ensure_bucket_loaded(bucket) + return key in self.data[bucket] + + def __delitem__(self, key): + bucket = self._bucket_secure(key) + self.ensure_bucket_loaded(bucket) + del self.data[bucket][key] + + def items(self): + for key in self.keys(): + yield key, self[key] + + @staticmethod + def bucket_keys(): + hexdigits = "0123456789abcdef" + for a in hexdigits: + for b in hexdigits: + yield a + b + + def by_bucket(self, bucket): + self.ensure_bucket_loaded(bucket) + return self.data[bucket] + + def keys(self, bucket=None): + if bucket: + if self._restrict_to_bucket and bucket != self._restrict_to_bucket: + raise Exception( + f'Attempt to access data in bucket {bucket}, while access is restricted to {self._restrict_to_bucket}') + self.ensure_bucket_loaded(bucket) + for k in self.data[bucket].keys(): + yield k + else: + if self._restrict_to_bucket: + buckets = [self._restrict_to_bucket] + else: + buckets = self.bucket_keys() + for bucket in buckets: + self.ensure_bucket_loaded(bucket) + for k in self.data[bucket].keys(): + yield k + + @staticmethod + def bucket(key): + return sha256(key.encode()).hexdigest()[:2] + + def _bucket_secure(self, 
key): + b = self.bucket(key) + restricted = self._restrict_to_bucket + if restricted and b != restricted: + raise Exception(f'Attempt to access data in bucket {b}, while access is restricted to {restricted}') + return b + + def save_bucket(self, bucket, directory_path): + self.ensure_bucket_loaded(bucket) + save = OrderedDict(sorted(self.data[bucket].items(), key=lambda item: item[0])) + with open(f"{directory_path}/{bucket}.json", 'w') as f: + json.dump(save, f, indent=2) + + def save(self): + if not os.path.isdir(self.directory): + os.mkdir(self.directory) + for bucket in self.data.keys(): + self.save_bucket(bucket, self.directory) + + def load_bucket(self, bucket): + file = f"{self.directory}/{bucket}.json" + if not os.path.isfile(file): + self.data[bucket] = {} + else: + with open(file) as f: + self.data[bucket] = json.load(f) + + def ensure_bucket_loaded(self, bucket): + if bucket not in self.data: + self.load_bucket(bucket) diff --git a/pypi-crawlers/src/crawl_sdist_deps.py b/pypi-crawlers/src/crawl_sdist_deps.py new file mode 100644 index 0000000..5ba369f --- /dev/null +++ b/pypi-crawlers/src/crawl_sdist_deps.py @@ -0,0 +1,242 @@ +import json +import os +import subprocess as sp +import traceback +from dataclasses import dataclass, field +from random import shuffle +from tempfile import TemporaryDirectory +from time import sleep, time +from typing import Union, List, ContextManager + +import utils +from bucket_dict import LazyBucketDict +from db import db, Package, init_db + + +@dataclass +class PackageJob: + name: str + version: str + url: Union[None, str] + sha256: Union[None, str] + idx: int + timeout: int = field(default=60) + + +@dataclass +class JobResult: + name: str + version: str + error: Union[None, str] + install_requires: Union[None, str, list, dict] + setup_requires: Union[None, str, list, dict] + extras_require: Union[None, str, list, dict] + python_requires: Union[None, str, list, dict] + + +def extractor_cmd(pkg_name, pkg_ver, out='./result', url=None, sha256=None, substitutes=True, store=None) -> List[str]: + extractor_dir = os.path.dirname(os.path.abspath(__file__)) + '/extractor/' + base_args = [ + "--arg", "pkg", f'"{pkg_name}"', + "--arg", "version", f'"{pkg_ver}"', + "-o", out + ] + if store: + base_args += ["--store", f"{store}"] + if url and sha256: + cmd = [ + "nix-build", f"{extractor_dir}/fast-extractor.nix", + "--arg", "url", f'"{url}"', + "--arg", "sha256", f'"{sha256}"' + ] + base_args + else: + cmd = [ + "nix-build", f"{extractor_dir}/extractor.nix", + ] + base_args + print('using slow builder') + if not substitutes: + cmd += ["--option", "build-use-substitutes", "false"] + return cmd + + +def format_log(log: str): + """ + Postgres doesn't support indexing large text files. + Therefore we limit line length and count + """ + lines = log.splitlines(keepends=True) + lines = map(lambda line: f"{line[:400]}\n" if len(line) > 400 else line, lines) + remove_lines_marker = ( + '/homeless-shelter/.cache/pip/http', + '/homeless-shelter/.cache/pip', + 'DEPRECATION: Python 2.7' + ) + filtered = filter(lambda l: not any(marker in l for marker in remove_lines_marker), lines) + return ''.join(list(filtered)[:90]) + + +def extract_requirements(job: PackageJob): + py_versions = ('python27', 'python35', 'python36', 'python37', 'python38') + try: + print(f"processing package nr. 
{job.idx} - {job.name}:{job.version}") + store = os.environ.get('STORE', None) + with TemporaryDirectory() as tempdir: + out_dir = f"{tempdir}/json" + cmd = extractor_cmd(job.name, job.version, out_dir, job.url, job.sha256, + store=store) + #print(' '.join(cmd).replace(' "', ' \'"').replace('" ', '"\' ')) + try: + sp.run(cmd, capture_output=True, timeout=job.timeout, check=True) + except (sp.CalledProcessError, sp.TimeoutExpired) as e: + print(f"problem with {job.name}:{job.version}") + print(e.stderr.decode()) + formatted = format_log(e.stderr.decode()) + return [dict( + name=job.name, + version=job.version, + py_ver=f"{py_ver}", + error=formatted, + ) for py_ver in py_versions] + results = [] + for py_ver in py_versions: + data = None + try: + path = os.readlink(f"{out_dir}") + if store: + path = path.replace('/nix/store', f"{store}/nix/store") + with open(f"{path}/{py_ver}.json") as f: + content = f.read().strip() + if content != '': + data = json.loads(content) + except FileNotFoundError: + pass + if data is None: + with open(f"{path}/{py_ver}.log") as f: + error = format_log(f.read()) + print(error) + results.append(dict( + name=job.name, + version=job.version, + py_ver=f"{py_ver}", + error=error, + )) + else: + results.append(dict( + name=job.name, + version=job.version, + py_ver=py_ver, + **data + )) + return results + except Exception as e: + traceback.print_exc() + return e + + +def get_jobs(pypi_fetcher_dir, bucket, processed, amount=1000): + pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi", restrict_to_bucket=bucket) + jobs = [] + names = list(pypi_dict.by_bucket(bucket).keys()) + total_nr = 0 + for pkg_name in names: + for ver, release_types in pypi_dict[pkg_name].items(): + if 'sdist' not in release_types: + continue + if (pkg_name, ver) in processed: + continue + total_nr += 1 + release = release_types['sdist'] + if len(jobs) <= amount: + jobs.append(PackageJob( + pkg_name, + ver, + f"https://files.pythonhosted.org/packages/source/{pkg_name[0]}/{pkg_name}/{release[1]}", + release[0], + 0, + )) + shuffle(jobs) + for i, job in enumerate(jobs): + job.idx = i + print(f"Bucket {bucket}: Planning execution of {len(jobs)} jobs out of {total_nr} total jobs for this bucket") + return jobs + + +def get_processed(): + with open('/tmp/jobs', 'r') as f: + return {tuple(t) for t in json.load(f)} + + +def build_base(store=None): + # make sure base stuff gets back into cache after cleanup: + cmd = extractor_cmd("requests", "2.22.0", out='/tmp/dummy', url='https://files.pythonhosted.org/packages/01/62/ddcf76d1d19885e8579acb1b1df26a852b03472c0e46d2b959a714c90608/requests-2.22.0.tar.gz', + sha256='11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4', store=store) + sp.check_call(cmd, timeout=1000) + + +def cleanup(): + sp.check_call('rm -rf ./dummy', shell=True) + cmd = "nix-collect-garbage" + store = os.environ.get('almighty_store', None) + if store: + cmd += f" --store {store}" + sp.check_call(cmd, shell=True) + + +def ensure_pypi_fetcher(dir): + if not os.path.isdir(dir): + cmd = f'git clone git@github.com:DavHau/nix-pypi-fetcher.git {dir}' + sp.check_call(cmd, shell=True) + sp.check_call("git checkout master && git pull", shell=True, cwd=dir) + + +class Measure(ContextManager): + def __init__(self, name): + self.name = name + def __enter__(self): + self.enter_time = time() + print(f'beginning "{self.name}"') + def __exit__(self, exc_type, exc_val, exc_tb): + dur = round(time() - self.enter_time, 1) + print(f'"{self.name}" took {dur}s') + + +def main(): + workers = 
int(os.environ.get('WORKERS', "1")) + pypi_fetcher_dir = os.environ.get('pypi_fetcher', '/tmp/pypi_fetcher') + ensure_pypi_fetcher(pypi_fetcher_dir) + init_db() + build_base(store=os.environ.get('STORE', None)) + P = Package + with Measure('Get processed pkgs from DB'): + processed = set((p.name, p.version) for p in P.select(P.name, P.version).distinct()) + print(f"DB contains {len(processed)} pkgs at this time") + for bucket in LazyBucketDict.bucket_keys(): + with Measure("getting jobs"): + jobs = get_jobs(pypi_fetcher_dir, bucket, processed, amount=1000) + if not jobs: + continue + with Measure('batch'): + if workers > 1: + pool_results = utils.parallel(extract_requirements, (jobs,), workers=workers, use_processes=False) + else: + pool_results = [extract_requirements(args) for args in jobs] + results = [] + for i, res in enumerate(pool_results): + if isinstance(res, Exception): + print(f"Problem with {jobs[i].name}:{jobs[i].version}") + if isinstance(res, sp.CalledProcessError): + print(res.stderr) + traceback.print_exception(res, res, res.__traceback__) + else: + for r in res: + results.append(r) + sleep(1) + with db.atomic(): + with Measure('bulk insert'): + Package.bulk_create([Package(**r) for r in results]) + if os.environ.get('CLEANUP', None): + cleanup() + + +if __name__ == "__main__": + main() diff --git a/pypi-crawlers/src/crawl_urls.py b/pypi-crawlers/src/crawl_urls.py new file mode 100644 index 0000000..9cdee47 --- /dev/null +++ b/pypi-crawlers/src/crawl_urls.py @@ -0,0 +1,113 @@ +import os +import sys +import traceback +import xmlrpc.client +from time import sleep + +import requests +import utils +from bucket_dict import LazyBucketDict +from requests import HTTPError + +base_url = "https://pypi.org/pypi" +session = requests.Session() +email = os.environ.get("EMAIL") +if not email: + raise Exception("Please provide EMAIL=") +headers = {'User-Agent': f'Pypi Daily Sync (Contact: {email})'} + + +def all_packages(): + xmlclient = xmlrpc.client.ServerProxy(base_url) + return xmlclient.list_packages_with_serial() + + +def pkg_meta(name): + resp = session.get(f"{base_url}/{name}/json", headers=headers) + resp.raise_for_status() + return resp.json() + + +def select_favorite_sdist_release(sdist_releases): + """ + Selects one sdist from a list while prioritizing the file suffixes + (tar.gz, tgz, zip, tar.bz2) (left == better). + If multiple filenames with same suffix exist, the shortest filename is picked + """ + sdist_releases = list(sdist_releases) + f_types = ('tar.gz', '.tgz', '.zip', '.tar.bz2') + compatible_releases = filter(lambda r: any(r['filename'].endswith(end) for end in f_types), + sdist_releases) + sorted_releases = sorted(compatible_releases, + key=lambda r: next(i for i, t in enumerate(f_types) if r['filename'].endswith(t))) + if not sorted_releases: + return [] + return sorted_releases[0] + + +def save_pkg_meta(name, pkgs_dict): + api_success = False + while not api_success: + try: + meta = pkg_meta(name) + api_success = True + except HTTPError as e: + if e.response.status_code == 404: + return + except: + traceback.print_exc() + print("Warning! problems accessing pypi api. 
Will retry in 10s") + sleep(10) + releases_dict = {} + # iterate over versions of current package + for release_ver, release in meta['releases'].items(): + sdists = filter(lambda file: file['packagetype'] in ["sdist"], release) + sdist = select_favorite_sdist_release(sdists) + wheels = list(filter(lambda file: file['packagetype'] in ["bdist_wheel"], release)) + if not (sdist or wheels): + continue + releases_dict[release_ver] = {} + if sdist: + releases_dict[release_ver]['sdist'] = [ + sdist['digests']['sha256'], + sdist['filename'], + ] + if wheels: + releases_dict[release_ver]['wheels'] = { + wheel['filename']: (wheel['digests']['sha256'], wheel['python_version']) + for wheel in wheels + } + if releases_dict: + pkgs_dict[name.replace('_', '-').lower()] = releases_dict + + +def crawl_pkgs_meta(packages, target_dir, workers): + pkgs_dict = LazyBucketDict(target_dir) + args_list = [(name, pkgs_dict) for name in packages] + if workers > 1: + utils.parallel(save_pkg_meta, zip(*args_list), workers=workers) + else: + [save_pkg_meta(*args) for args in args_list] + pkgs_dict.save() + + +def names_in_buckets(): + in_buckets = {} + for name in all_packages(): + bucket = LazyBucketDict.bucket(name.replace('_', '-').lower()) + if bucket not in in_buckets: + in_buckets[bucket] = [] + in_buckets[bucket].append(name) + return in_buckets + + +def main(): + target_dir = sys.argv[1] + workers = int(os.environ.get('WORKERS', "1")) + for i, names in enumerate(names_in_buckets().values()): + print(f"crawling bucket nr. {i}") + crawl_pkgs_meta(names, target_dir, workers=workers) + + +if __name__ == "__main__": + main() diff --git a/pypi-crawlers/src/crawl_wheel_deps.py b/pypi-crawlers/src/crawl_wheel_deps.py new file mode 100644 index 0000000..84d2595 --- /dev/null +++ b/pypi-crawlers/src/crawl_wheel_deps.py @@ -0,0 +1,214 @@ +import json +import os +import sys +import traceback +import zipfile +from dataclasses import dataclass +from os.path import isdir +from random import shuffle +from tempfile import NamedTemporaryFile +from time import sleep +from typing import Union + +import pkginfo +import requests + +from bucket_dict import LazyBucketDict +from utils import parallel + + +email = os.environ.get("EMAIL") +if not email: + raise Exception("Please provide EMAIL=") +headers = {'User-Agent': f'Pypi Daily Sync (Contact: {email})'} + + +@dataclass +class Job: + name: str + ver: str + filename: str + pyver: str + url: str + nr: int + bucket: str + + +@dataclass() +class Result: + job: Job + requires_dist: str + provides_extras: str + requires_external: str + requires_python: str + + +class Retry(Exception): + pass + + +def construct_url(name, pyver, filename: str): + base_url = "https://files.pythonhosted.org/packages/" + return f"{base_url}{pyver}/{name[0]}/{name}/{filename}" + + +def mine_wheel_metadata_full_download(job: Job) -> Union[Result, Exception]: + print(f"Bucket {job.bucket} - Job {job.nr} - {job.name}:{job.ver}") + for _ in range(5): + try: + with NamedTemporaryFile(suffix='.whl') as f: + resp = requests.get(job.url, headers=headers) + if resp.status_code == 404: + return requests.HTTPError() + if resp.status_code in [503, 502]: + try: + resp.raise_for_status() + except: + traceback.print_exc() + raise Retry + resp.raise_for_status() + with open(f.name, 'wb') as f_write: + f_write.write(resp.content) + metadata = pkginfo.get_metadata(f.name) + return Result( + job=job, + requires_dist=metadata.requires_dist, + provides_extras=metadata.provides_extras, + requires_external=metadata.requires_external, 
+ requires_python=metadata.requires_python, + ) + except Retry: + sleep(10) + except zipfile.BadZipFile as e: + return e + except Exception: + print(f"Problem with {job.name}:{job.ver}") + traceback.print_exc() + raise + + +def is_done(dump_dict, pkg_name, pkg_ver, pyver, filename): + try: + dump_dict[pkg_name][pyver][pkg_ver][filename] + except KeyError: + return False + else: + return True + + +def get_jobs(bucket, pypi_dict:LazyBucketDict, dump_dict: LazyBucketDict): + names = list(pypi_dict.by_bucket(bucket).keys()) + jobs = [] + for pkg_name in names: + for ver, release_types in pypi_dict[pkg_name].items(): + if 'wheels' not in release_types: + continue + for filename, data in release_types['wheels'].items(): + pyver = data[1] + if is_done(dump_dict, pkg_name, ver, pyver, filename): + continue + url = construct_url(pkg_name, pyver, filename) + jobs.append(dict( + name=pkg_name, ver=ver, filename=filename, pyver=pyver, + url=url, bucket=bucket)) + shuffle(jobs) + return [Job(**j, nr=idx) for idx, j in enumerate(jobs)] + + +def sort(d: dict): + res = {} + for k, v in sorted(d.items()): + if isinstance(v, dict): + res[k] = sort(v) + else: + res[k] = v + return res + + +def decompress(d): + for name, pyvers in d.items(): + for pyver, pkg_vers in pyvers.items(): + for pkg_ver, fnames in pkg_vers.items(): + for fn, data in fnames.items(): + if isinstance(data, str): + key_ver, key_fn = data.split('@') + try: + pkg_vers[key_ver][key_fn] + except KeyError: + print(f"Error with key_ver: {key_ver} , key_fn: {key_fn}") + exit() + fnames[fn] = pkg_vers[key_ver][key_fn] + + +def compress(dump_dict): + decompress(dump_dict) + # sort + for k, v in dump_dict.items(): + dump_dict[k] = sort(v) + for name, pyvers in dump_dict.items(): + for pyver, pkg_vers in pyvers.items(): + + all_fnames = {} + for pkg_ver, fnames in pkg_vers.items(): + for fn, data in fnames.items(): + for existing_key, d in all_fnames.items(): + if data == d: + fnames[fn] = existing_key + break + if not isinstance(fnames[fn], str): + all_fnames[f"{pkg_ver}@{fn}"] = data + + +def exec_or_return_exc(func, job): + try: + return func(job) + except Exception as e: + traceback.print_exc() + return e + + +def main(): + dump_dir = sys.argv[1] + workers = int(os.environ.get('WORKERS', "1")) + pypi_fetcher_dir = os.environ.get('pypi_fetcher') + print(f'Index directory: {pypi_fetcher_dir}') + assert isdir(pypi_fetcher_dir) + for bucket in LazyBucketDict.bucket_keys(): + print(f"Begin wit bucket {bucket}") + pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi") + dump_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket) + jobs = list(get_jobs(bucket, pypi_dict, dump_dict)) + if not jobs: + continue + print(f"Starting batch with {len(jobs)} jobs") + func = mine_wheel_metadata_full_download + if workers > 1: + def f(job): + return exec_or_return_exc(func, job) + result = parallel(f, (jobs,), workers=workers) + else: + result = [exec_or_return_exc(func, job) for job in jobs] + for r in result: + if isinstance(r, Exception): + continue + name = r.job.name + ver = r.job.ver + pyver = r.job.pyver + fn = r.job.filename + if name not in dump_dict: + dump_dict[name] = {} + if pyver not in dump_dict[name]: + dump_dict[name][pyver] = {} + if ver not in dump_dict[name][pyver]: + dump_dict[name][pyver][ver] = {} + dump_dict[name][pyver][ver][fn] = {} + for key in ('requires_dist', 'provides_extras', 'requires_external', 'requires_python'): + val = getattr(r, key) + if val: + dump_dict[name][pyver][ver][fn][key] = val + compress(dump_dict) + 
dump_dict.save() + + +if __name__ == "__main__": + main() diff --git a/pypi-crawlers/src/db.py b/pypi-crawlers/src/db.py new file mode 100644 index 0000000..768db7f --- /dev/null +++ b/pypi-crawlers/src/db.py @@ -0,0 +1,54 @@ +import os + +from peewee import * +from playhouse.postgres_ext import * + +db = PostgresqlExtDatabase( + 'almighty', + user='almighty', + password=os.environ.get('DB_PASS'), + host=os.environ.get('DB_HOST'), + port=5432 +) + + +class BaseModel(Model): + class Meta: + database = db + + +class Package(BaseModel): + name = CharField(index=True) + version = CharField(index=True) + py_ver = CharField(index=True) + error = TextField(null=True, index=True) + install_requires = BinaryJSONField(null=True) + setup_requires = BinaryJSONField(null=True) + extras_require = BinaryJSONField(null=True) + tests_require = BinaryJSONField(null=True) + python_requires = BinaryJSONField(null=True) + class Meta: + indexes = ( + (('name', 'version', 'py_ver'), True), + ) + + @classmethod + def defaults(cls): + return dict( + error=None, + install_requires=None, + setup_requires=None, + extras_require=None, + tests_require=None, + python_requires=None, + ) + + +def init_db(): + pass + db.drop_tables([]) + db.create_tables([Package]) + + +if __name__ == "__main__": + init_db() diff --git a/pypi-crawlers/src/dump_sdist_deps.py b/pypi-crawlers/src/dump_sdist_deps.py new file mode 100644 index 0000000..f2c61fe --- /dev/null +++ b/pypi-crawlers/src/dump_sdist_deps.py @@ -0,0 +1,147 @@ +import sys +from dataclasses import asdict, dataclass +from typing import Set, Dict + +from packaging.version import parse + +from bucket_dict import LazyBucketDict +from db import Package as P + + +@dataclass +class PKG: + install_requires: str + setup_requires: str + extras_require: str + tests_require: str + python_requires: str + + +def flatten_req_list(obj): + if isinstance(obj, str): + yield obj + elif isinstance(obj, list): + if len(obj) == 0: + return + elif len(obj) == 1: + for s in flatten_req_list(obj[0]): + yield s + else: + for elem in obj: + for s in flatten_req_list(elem): + yield s + else: + raise Exception('Is not list or str') + + +flatten_keys = ( + 'setup_requires', + 'install_requires', + 'tests_require', + 'python_requires', +) + + +def pkg_to_dict(pkg): + pkg_dict = asdict(PKG( + install_requires=pkg.install_requires, + setup_requires=pkg.setup_requires, + extras_require=pkg.extras_require, + tests_require=pkg.tests_require, + python_requires=pkg.python_requires + )) + new_release = {} + for key, val in pkg_dict.items(): + if not val: + continue + if key == 'extras_require': + for extra_key, extra_reqs in val.items(): + val[extra_key] = list(flatten_req_list(extra_reqs)) + if key not in flatten_keys: + new_release[key] = val + continue + val = list(flatten_req_list(val)) + if isinstance(val, str): + val = [val] + if not all(isinstance(elem, str) for elem in val): + print(val) + raise Exception('Requirements must be list of strings') + new_release[key] = val + return new_release + + +def insert(py_ver, name, ver, release, target): + ver = str(parse(ver)) + # create structure + if name not in target: + target[name] = {} + if ver not in target[name]: + target[name][ver] = {} + # if exact same pkg data already exists for another version, + # just refer to other version to prevent duplicates + for py, existing_pkg in target[name][ver].items(): + if release == existing_pkg: + target[name][ver][py_ver] = py + return + target[name][ver][py_ver] = release + + +def get_names_per_bucket() -> Dict[str, 
Set[str]]: + result = {} + hexdigits = "0123456789abcdef" + for a in hexdigits: + for b in hexdigits: + result[a + b] = set() + keys = [p.name for p in P.select(P.name).distinct()] + for key in keys: + result[LazyBucketDict.bucket(key)].add(key) + return result + + +def compress_dict(d, sort=True): + if sort: + items = sorted(d.items(), key=lambda x: x[0]) + else: + items = d.items() + keep = {} + for k, v in items: + for keep_key, keep_val in keep.items(): + if v == keep_val: + d[k] = keep_key + break + if not isinstance(d[k], str): + keep[k] = v + + +def compress(pkgs_dict: LazyBucketDict): + for name, vers in pkgs_dict.items(): + for ver, pyvers in vers.items(): + compress_dict(pyvers) + compress_dict(vers) + + +def main(): + dump_dir = sys.argv[1] + for bucket_key, key_set in get_names_per_bucket().items(): + pkgs_dict = LazyBucketDict(f"{dump_dir}", restrict_to_bucket=bucket_key) + pkgs = P.select( + P.id, + P.name, + P.version, + P.py_ver, + P.install_requires, + P.setup_requires, + P.extras_require, + P.tests_require, + P.python_requires, + ).where(P.error.is_null(), P.name.in_(key_set)) + print(f'dumping bucket {bucket_key}') + for pkg in sorted(pkgs, key=lambda pkg: (pkg.name, pkg.version, pkg.py_ver)): + py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver)) + insert(py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), pkgs_dict) + compress(pkgs_dict) + pkgs_dict.save() + + +if __name__ == "__main__": + main() diff --git a/pypi-crawlers/src/extractor/default.nix b/pypi-crawlers/src/extractor/default.nix new file mode 100644 index 0000000..cdfe1f3 --- /dev/null +++ b/pypi-crawlers/src/extractor/default.nix @@ -0,0 +1,132 @@ +let + pkgs = import <nixpkgs> { config = { allowUnfree = true; }; overlays = []; }; + commit = "1434cc0ee2da462f0c719d3a4c0ab4c87d0931e7"; + fetchPypiSrc = builtins.fetchTarball { + name = "nix-pypi-fetcher"; + url = "https://github.com/DavHau/nix-pypi-fetcher/archive/${commit}.tar.gz"; + # Hash obtained using `nix-prefetch-url --unpack <url>` + sha256 = "080l189zzwrv75jgr7agvs4hjv4i613j86d4qky154fw5ncp0mnp"; + }; + fetchPypi = import (fetchPypiSrc); + patchDistutils = python_env: + with builtins; + let + verSplit = split "[\.]" python_env.python.version; + major = elemAt verSplit 0; + minor = elemAt verSplit 2; + lib_dir = "$out/lib/python${major}.${minor}"; + site_pkgs_dir = "${lib_dir}/site-packages"; + in + pkgs.symlinkJoin { + name = "${python_env.name}-patched"; + paths = [ python_env ]; + postBuild = '' + ### Distutils + # symlinks to files + mkdir ${lib_dir}/distutils_tmp + cp -a ${lib_dir}/distutils/* ${lib_dir}/distutils_tmp/ + rm ${lib_dir}/distutils + mv ${lib_dir}/distutils_tmp ${lib_dir}/distutils + # patch distutils/core.py + patch ${lib_dir}/distutils/core.py ${./distutils.patch} + # remove .pyc files + + if [ ${major} = 2 ]; then + rm ${lib_dir}/distutils/core.pyc + else + chmod +w ${lib_dir}/distutils/__pycache__/ + rm ${lib_dir}/distutils/__pycache__/core.* + fi + + + ### Setuptools + # symlinks to files + mkdir ${site_pkgs_dir}/setuptools_tmp + cp -a ${site_pkgs_dir}/setuptools/* ${site_pkgs_dir}/setuptools_tmp/ + rm ${site_pkgs_dir}/setuptools + mv ${site_pkgs_dir}/setuptools_tmp ${site_pkgs_dir}/setuptools + # patch setuptools/__init__.py + echo ${site_pkgs_dir}/setuptools/__init__.py + patch ${site_pkgs_dir}/setuptools/__init__.py ${./setuptools.patch} + # remove .pyc files + if [ ${major} = 2 ]; then + rm ${site_pkgs_dir}/setuptools/__init__.pyc + else + chmod +w ${site_pkgs_dir}/setuptools/__pycache__ + rm
${site_pkgs_dir}/setuptools/__pycache__/__init__.* + fi + + # fix executables + for f in $(ls ${python_env}/bin); do + sed -i "s|${python_env}|$out|g" $out/bin/$f + sed -i "/NIX_PYTHONPATH/a export PYTHONPATH=$out\/lib\/python${major}.${minor}" $out/bin/$f + done + ''; + }; + + mkPy = python: + let + python_env = python.withPackages (ps: with ps; [ + # base requirements + setuptools + pkgconfig + ]); + in + patchDistutils python_env; + +in +let + py27 = mkPy pkgs.python27; + py35 = mkPy pkgs.python35; + py36 = mkPy pkgs.python36; + py37 = mkPy pkgs.python37; + py38 = mkPy pkgs.python38; + # This is how pip invokes setup.py. We do this manually instead of using pip to increase performance by ~40% + setuptools_shim = '' + import sys, setuptools, tokenize; sys.argv[0] = 'setup.py'; __file__='setup.py'; + f=getattr(tokenize, 'open', open)(__file__); + code=f.read().replace('\r\n', '\n'); + f.close(); + exec(compile(code, __file__, 'exec')) + ''; + script = '' + mkdir $out + echo "python27" + out_file=$out/python27.json ${py27}/bin/python -c "${setuptools_shim}" install &> $out/python27.log || true + echo "python35" + out_file=$out/python35.json ${py35}/bin/python -c "${setuptools_shim}" install &> $out/python35.log || true + echo "python36" + out_file=$out/python36.json ${py36}/bin/python -c "${setuptools_shim}" install &> $out/python36.log || true + echo "python37" + out_file=$out/python37.json ${py37}/bin/python -c "${setuptools_shim}" install &> $out/python37.log || true + echo "python38" + out_file=$out/python38.json ${py38}/bin/python -c "${setuptools_shim}" install &> $out/python38.log || true + ''; + base_derivation = with pkgs; { + buildInputs = [ unzip pkg-config pipenv ]; + phases = ["unpackPhase" "installPhase"]; + # Tells our modified python builtins to dump setup attributes instead of doing an actual installation + dump_setup_attrs = "y"; + PYTHONIOENCODING = "utf8"; # My gut feeling is that encoding issues might decrease by this + installPhase = script; + }; +in +with pkgs; +rec { + inherit py27 py35 py36 py37 py38; + all = { inherit py27 py35 py36 py37 py38; }; + inherit machnix_source; + example = extractor {pkg = "requests"; version = "2.22.0";}; + extractor = {pkg, version}: + stdenv.mkDerivation ({ + name = "${pkg}-${version}-requirements"; + src = fetchPypi pkg version; + } // base_derivation); + extractor-fast = {pkg, version, url, sha256}: + stdenv.mkDerivation ({ + name = "${pkg}-${version}-requirements"; + src = pkgs.fetchurl { + inherit url sha256; + }; + } // base_derivation); +} diff --git a/pypi-crawlers/src/extractor/distutils.patch b/pypi-crawlers/src/extractor/distutils.patch new file mode 100644 index 0000000..c008088 --- /dev/null +++ b/pypi-crawlers/src/extractor/distutils.patch @@ -0,0 +1,55 @@ +diff --git a/Lib/distutils/core.py b/Lib/distutils/core.py +index d603d4a45a..a589477b8e 100644 +--- a/Lib/distutils/core.py ++++ b/Lib/distutils/core.py +@@ -120,6 +120,50 @@ def setup (**attrs): + # the setup script, but be overridden by the command line. + dist.parse_config_files() + ++ def dump(): ++ def jsonify(obj): ++ if isinstance(obj, str): ++ return obj ++ if sys.version_info < (3, 0) and isinstance(obj, unicode): ++ return str(obj) ++ if isinstance(obj, bytes): ++ return obj.decode() ++ if isinstance(obj, dict): ++ return {jsonify(key): jsonify(val) for key, val in obj.items()} ++ try: ++ # convert to list if possible ++ return [jsonify(elem) for elem in obj] ++ except: ++ pass ++ # fallback to string repr. 
of obj ++ return str(obj) ++ ++ keys = ( ++ 'install_requires', ++ 'setup_requires', ++ 'extras_require', ++ 'tests_require', ++ 'python_requires' ++ ) ++ data = {} ++ for key in keys: ++ val = getattr(dist, key, None) ++ if not val: ++ continue ++ data[key] = jsonify(val) ++ return data ++ if os.environ.get("dump_setup_attrs", None): ++ import json ++ try: ++ data = dump() ++ except: ++ import traceback ++ data = dict(traceback=traceback.format_exc()) ++ out = os.environ.get("out_file") ++ with open(out, 'w') as f: ++ json.dump(data, f, indent=2) ++ exit() ++ + if DEBUG: + print("options (after parsing config files):") + dist.dump_option_dicts() diff --git a/pypi-crawlers/src/extractor/extractor.nix b/pypi-crawlers/src/extractor/extractor.nix new file mode 100644 index 0000000..f622dfd --- /dev/null +++ b/pypi-crawlers/src/extractor/extractor.nix @@ -0,0 +1 @@ +(import ./default.nix).extractor diff --git a/pypi-crawlers/src/extractor/fast-extractor.nix b/pypi-crawlers/src/extractor/fast-extractor.nix new file mode 100644 index 0000000..47553fa --- /dev/null +++ b/pypi-crawlers/src/extractor/fast-extractor.nix @@ -0,0 +1 @@ +(import ./default.nix).extractor-fast diff --git a/pypi-crawlers/src/extractor/setuptools.patch b/pypi-crawlers/src/extractor/setuptools.patch new file mode 100644 index 0000000..1e96b2f --- /dev/null +++ b/pypi-crawlers/src/extractor/setuptools.patch @@ -0,0 +1,13 @@ +diff --git a/setuptools/__init__.py b/setuptools/__init__.py +index 83882511..6e762e3d 100644 +--- a/setuptools/__init__.py ++++ b/setuptools/__init__.py +@@ -251,3 +251,8 @@ class sic(str): + + # Apply monkey patches + monkey.patch_all() ++def setup(**attrs): ++ # Make sure we have any requirements needed to interpret 'attrs'. ++ if not os.environ.get("dump_setup_attrs", None): ++ _install_setup_requires(attrs) ++ return distutils.core.setup(**attrs) diff --git a/pypi-crawlers/src/utils.py b/pypi-crawlers/src/utils.py new file mode 100644 index 0000000..0af3278 --- /dev/null +++ b/pypi-crawlers/src/utils.py @@ -0,0 +1,11 @@ +from bounded_pool_executor import BoundedThreadPoolExecutor, BoundedProcessPoolExecutor + + +def parallel(func, args_list, workers=10, use_processes=False): + if use_processes: + Executor = BoundedProcessPoolExecutor + else: + Executor = BoundedThreadPoolExecutor + with Executor(max_workers=workers) as tpe: + res = tpe.map(func, *args_list) + return list(res)
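
The dump written by dump_sdist_deps.py is deduplicated on two levels: insert() stores a python-version entry as a plain string when an identical release dict already exists under another python version, and compress_dict() applies the same trick across the versions of a package, replacing a value that equals an earlier one by the key it first appeared under. Reading a bucket back therefore means following those string references until a dict is reached. The helper below is a minimal sketch of that lookup and is not part of the crawler code; the function name and the sample keys are hypothetical.

    def resolve(mapping: dict, key: str):
        # Follow string references ("this key holds the same data as that key")
        # until an actual dict of package data is reached.
        val = mapping[key]
        while isinstance(val, str):
            val = mapping[val]
        return val

    # Usage, assuming `bucket` is one bucket of the dump loaded as a plain dict:
    # per_py = resolve(bucket["requests"], "2.22.0")   # version level
    # release = resolve(per_py, "38")                  # python-version level
    # release.get("install_requires")                  # -> list of requirement strings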
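
default.nix drives each patched interpreter through setuptools_shim with dump_setup_attrs=y and out_file pointing into $out, so a successful extractor build yields one JSON file per interpreter (python27.json through python38.json) next to the matching log. Each JSON file carries the keys dumped by distutils.patch (install_requires, setup_requires, extras_require, tests_require, python_requires), or a single traceback key if extraction failed. A minimal sketch of consuming such a build output follows; the result path is only an assumption about where nix-build placed its output symlink.

    import json
    from pathlib import Path

    # "result" is assumed to be the nix-build output symlink of extractor.nix
    # for a single package; adjust the path to your setup.
    for dump_file in sorted(Path("result").glob("python*.json")):
        with open(dump_file) as f:
            attrs = json.load(f)
        if "traceback" in attrs:
            print(dump_file.name, "extraction failed")
            continue
        print(dump_file.name, attrs.get("install_requires"), attrs.get("python_requires"))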
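
utils.parallel() hands args_list straight to Executor.map(), so it expects one iterable per positional argument of the worker (as in map(func, iterable1, iterable2)), not a list of argument tuples. The sketch below is purely illustrative of that calling convention; the scale worker and its inputs are made up.

    from utils import parallel

    def scale(value, factor):
        # One iterable per positional argument is supplied in the call below.
        return value * factor

    values = [1, 2, 3, 4]
    factors = [10] * len(values)
    print(parallel(scale, [values, factors], workers=4))  # -> [10, 20, 30, 40]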