Skip to content

Commit

Permalink
bug/gen pypi map api incompat (#224)
Browse files Browse the repository at this point in the history
* Introducing a BQ variant of PackageInfo

* Update README.md

* Avoid duplication in main()

* More time components in progress countdown

Seconds or days alone are a bit too coarse when running locally; I found myself
converting the seconds by hand throughout the run.
  • Loading branch information
blast-hardcheese authored Jan 24, 2024
1 parent f6ebc5f commit 934e8bc
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 25 deletions.
20 changes: 7 additions & 13 deletions internal/backends/python/gen_pypi_map/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The stats are downloaded from a public BigQuery table made available
by PyPI. To download the stats and update the `download_stats.json` file:

```bash
go run ./gen_pypi_map -cmd bq -gcp <gcp-project-name>
go run ./gen_pypi_map bq -gcp <gcp-project-name>
```

The `gcp-project-name` can be any Replit GCP project, because the table we are accessing, `bigquery-public-data.pypi.file_downloads`, is public. More info here: <https://packaging.python.org/en/latest/guides/analyzing-pypi-package-downloads/>
Expand All @@ -33,20 +33,14 @@ is:
2. use pkgutil to see what new modules were added compared to before

We used to run this test on all modules on PyPI. Now we have the option to
run it only on a subset of modules. You may use the `gen_top_packages.py` script
to generate a list of the top packages to test. The default is to collect the
top 10000 packages. You can change this by passing in a different number to the
optional `-n` flag. For example, to generate the top 50000 packages and output
it to `pkgs_to_test.json`:
run it only on a subset of modules. The default is to collect the top
10000 packages. You can change this by passing in a different number to the
optional `-threshold` flag.

```bash
./gen_top_packages.py -n 50000 pkgs_to_test.json
```

This may then be used to test the packages and update `pkgs.json` with:
For example, to test the top 50000 packages:

```bash
go run ./gen_pypi_map/ -cmd test -index pkgs_to_test.json
go run ./gen_pypi_map/ test -threshold 50000
```

The test results for all tested packages will be stored in `pkgs.json` along with
Expand All @@ -60,7 +54,7 @@ want to force a retest of the packages, you can use the `-force` flag.
Finally, we use the collected data to generate the code file. This is done with:

```bash
go run ./gen_pypi_map/ -cmd gen
go run ./gen_pypi_map/ gen
```

Or:
Expand Down
4 changes: 2 additions & 2 deletions internal/backends/python/gen_pypi_map/download_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ func GetPypiStats(projectID string) (map[string]int, error) {

packages := map[string]int{}
for {
var info PackageInfo
var info BqPackageInfo
err := it.Next(&info)
if err == iterator.Done {
break
}
if err != nil {
return packages, err
}
packages[info.Name] = info.Downloads.LastMonth
packages[info.Name] = info.Downloads
}

return packages, nil
Expand Down
15 changes: 7 additions & 8 deletions internal/backends/python/gen_pypi_map/gen_pypi_map.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,19 +163,18 @@ func main() {
}
if cmd, ok := validCmds[command]; ok {
cmd(os.Args[2:])
} else if command != "" {
choices := make([]string, 0, len(validCmds))
for cmd := range validCmds {
choices = append(choices, cmd)
}
sort.Strings(choices)
fmt.Fprintf(os.Stderr, "Error: Invalid command '%s'.\nValid commands are %s.\n", command, strings.Join(choices, ", "))
} else {
var msg string
if command != "" {
msg = fmt.Sprintf("Invalid command '%s'.", command)
} else {
msg = "No command provided."
}
choices := make([]string, 0, len(validCmds))
for cmd := range validCmds {
choices = append(choices, cmd)
}
sort.Strings(choices)
fmt.Fprintf(os.Stderr, "Error: No command provided.\nValid commands are %s.\n", strings.Join(choices, ", "))
fmt.Fprintf(os.Stderr, "Error: %s\nValid commands are %s.\n", msg, strings.Join(choices, ", "))
}
}
12 changes: 10 additions & 2 deletions internal/backends/python/gen_pypi_map/test_modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,17 @@ func TestModules(packages PackageIndex, cacheDir string, pkgsFile string, distMo

elapsed := time.Since(startTime)
rate := float64(processedPackages) / elapsed.Seconds()
remaining := float64(discoveredPackages-processedPackages) / rate
remaining := int(float64(discoveredPackages-processedPackages) / rate)

fmt.Printf("%v/%v %.2f%% [%.0fs %.0fd]\n", processedPackages, discoveredPackages, 100*percentage, remaining, remaining/60/60/24)
days := remaining / 60 / 60 / 24
remaining = remaining % (60 * 60 * 24)
hours := remaining / 60 / 60
remaining = remaining % (60 * 60)
minutes := remaining / 60
remaining = remaining % 60
seconds := remaining

fmt.Printf("%v/%v %.2f%% [%02dd%02dh%02dm%02ds]\n", processedPackages, discoveredPackages, 100*percentage, days, hours, minutes, seconds)
}
}

Expand Down
9 changes: 9 additions & 0 deletions internal/backends/python/gen_pypi_map/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ import (

type PackageCache = map[string]PackageInfo

// BqPackageInfo is the BigQuery variant of PackageInfo.
//
// The two are very similar, save for "downloads": here it is a plain int,
// whereas in PackageInfo it is an object.
type BqPackageInfo struct {
Name string `json:"name,omitempty"`
// The ",string" option makes encoding/json read and write this value as a
// JSON string rather than a JSON number.
// NOTE(review): confirm whether the BigQuery row iterator that fills this
// struct honors json tags at all, or matches columns by field name.
Downloads int `json:"downloads,string,omitempty"`
Version string `json:"version,omitempty"`
RequiresDist []string `json:"requires_dist,omitempty"`
}

type DownloadsInfo struct {
LastDay int `json:"last_day"`
LastWeek int `json:"last_week"`
Expand Down

0 comments on commit 934e8bc

Please sign in to comment.