-
Notifications
You must be signed in to change notification settings - Fork 207
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add scripts for downloading open source corpora. (#4193)
Add scripts for downloading open source corpora.
- Loading branch information
1 parent
cdee9bc
commit 1b0e339
Showing
9 changed files
with
429 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Don't commit the downloaded files. | ||
download/ | ||
out/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
This directory contains a package with scripts for downloading corpora of open | ||
source Dart code for automated analysis. There are a few scripts for | ||
downloading from various places: | ||
|
||
* `clone_flutter_apps.dart`: Clones GitHub repositories linked to from | ||
[github.com/tortuvshin/open-source-flutter-apps](https://github.com/tortuvshin/open-source-flutter-apps), which is a registry of open source Flutter apps. | ||
Downloads them to `download/apps`. | ||
|
||
* `clone_widgets.apps.dart`: Clones GitHub repositories referenced by | ||
[itsallwidgets.com](https://itsallwidgets.com/), which is a collection of | ||
open source Flutter apps and widgets. Downloads them to `download/widgets`. | ||
|
||
* `download_packages.dart`: Downloads recent packages from | ||
[pub.dev](https://pub.dev/). Downloads to `download/pub`. | ||
|
||
Once a corpus is downloaded, there is another script that copies over just the | ||
`.dart` files while discarding "uninteresting" files like generated ones: | ||
|
||
* `copy_corpus.dart`: Copies `.dart` files from one of the download | ||
directories. Pass `apps`, `widgets`, `pub`, etc. Can also copy sources from | ||
the Dart SDK repo (`dart`) or Flutter repo (`flutter`). For that to work, | ||
those repos must be in directories next to the language repo. | ||
|
||
You can pass `--sample=<percent>` to take a random sample of a corpus. For | ||
example, `--sample=5` will copy over only 5% of the files, chosen randomly. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import 'package:corpus/utils.dart'; | ||
|
||
/// Match URIs that point to GitHub repos. Look for a trailing ")" (after an | ||
/// allowed trailing "/") in order to only find Markdown link URIs that are | ||
/// directly to repos and not to paths within them like the images in the | ||
/// header. | ||
final _gitHubRepoPattern = | ||
RegExp(r'https://github.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)/?\)'); | ||
|
||
const _readmeUri = | ||
'https://raw.githubusercontent.com/tortuvshin/open-source-flutter-apps/' | ||
'refs/heads/master/README.md'; | ||
|
||
/// Clones every GitHub repository linked from the README of:
///
///     https://github.com/tortuvshin/open-source-flutter-apps
///
/// The clones are requested under the `apps` corpus; the `download/apps`
/// directory is cleaned first.
void main(List<String> arguments) async {
  clean('download/apps');

  print('Getting README.md...');
  var readme = await httpGet(_readmeUri);

  // Collect into a set so that a repo linked more than once is only cloned
  // once. Set literals keep insertion order, so repos are still visited in
  // the order they appear in the README.
  var repos = {
    for (var match in _gitHubRepoPattern.allMatches(readme))
      (user: match[1]!, repo: match[2]!),
  };

  // The README links to its own repository; that is not an app.
  repos.remove((user: 'tortuvshin', repo: 'open-source-flutter-apps'));

  var downloader = Downloader(totalResources: repos.length, concurrency: 5);
  for (var (:user, :repo) in repos) {
    downloader.cloneGitHubRepo('apps', user, repo);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import 'package:corpus/utils.dart'; | ||
|
||
/// Match URIs that point to GitHub repos. | ||
final _gitHubRepoPattern = | ||
RegExp(r'https://github.com/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)'); | ||
|
||
/// Clones the GitHub repositories of the open source apps listed in the
/// itsallwidgets.com feed.
///
/// Only feed entries whose `type` is `app` and whose `repo_url` matches
/// [_gitHubRepoPattern] are cloned; everything else is skipped.
void main(List<String> arguments) async {
  clean('download/widgets');

  print('Getting page feed...');
  var feed =
      await httpGetJson('https://itsallwidgets.com/feed?open_source=true');

  // A set, so that a repo referenced by several entries is cloned once.
  var repos = <({String user, String repo})>{};
  for (var item in feed as List<Object?>) {
    var fields = item as Map<String, Object?>;

    if (fields['type'] == 'app') {
      var url = fields['repo_url'] as String?;

      // Only GitHub URLs can be cloned here; entries with no repo URL or
      // with one on another host (e.g. BitBucket) yield no match.
      var match = url == null ? null : _gitHubRepoPattern.firstMatch(url);
      if (match != null) {
        repos.add((user: match[1]!, repo: match[2]!));
      }
    }
  }

  var downloader = Downloader(totalResources: repos.length, concurrency: 10);
  for (var (:user, :repo) in repos) {
    downloader.cloneGitHubRepo('widgets', user, repo);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import 'dart:io'; | ||
import 'dart:math'; | ||
|
||
import 'package:args/args.dart'; | ||
import 'package:path/path.dart' as p; | ||
|
||
/// What percentage of files should be copied over. Used to take a random | ||
/// sample of a corpus. | ||
int _samplePercent = 100; | ||
|
||
final _random = Random(); | ||
|
||
const _ignoreDirs = [ | ||
'pkg/dev_compiler/gen/', | ||
'tests/co19/', | ||
'third_party/observatory_pub_packages/', | ||
'tools/sdks/', | ||
'out/', | ||
'xcodebuild/', | ||
|
||
// Redundant stuff in Flutter. | ||
'bin/cache/', | ||
|
||
// Redundant packages that are in the SDK. | ||
'analyzer-', | ||
'compiler_unsupported-', | ||
'dev_compiler-', | ||
]; | ||
|
||
// Note! Assumes the Dart SDK and Flutter repos have been cloned in | ||
// directories next to the corpus repo. Also assumes this script has been run | ||
// from the root directory of this repo. | ||
const _corpora = [ | ||
('apps', 'download/apps'), | ||
('dart', '../../../dart/sdk'), | ||
('flutter', '../../../flutter'), | ||
('pub', 'download/pub'), | ||
('widgets', 'download/widgets'), | ||
]; | ||
|
||
final generatedSuffixes = ['.g.dart', '.freezed.dart']; | ||
|
||
/// Copies the corpora named on the command line into `out/`.
///
/// Each entry of [_corpora] whose name appears in [arguments] is copied with
/// [copyDir], one after another. `--sample=<percent>` (or `-s`) sets
/// [_samplePercent]; it must be an integer from 0 to 100. An invalid value
/// is reported on stderr and nothing is copied.
void main(List<String> arguments) async {
  var argParser = ArgParser()
    ..addFlag('omit-slow')
    ..addOption('sample', abbr: 's', defaultsTo: '100');

  var argResults = argParser.parse(arguments);

  // Validate instead of letting int.parse throw an unhandled FormatException
  // (or silently accepting a nonsensical percentage).
  var sample = argResults['sample'] as String;
  var percent = int.tryParse(sample);
  if (percent == null || percent < 0 || percent > 100) {
    stderr.writeln(
        'Invalid --sample value "$sample": expected an integer from 0 to 100.');
    exitCode = 64; // Conventional "command line usage error" status.
    return;
  }
  _samplePercent = percent;

  for (var (name, directory) in _corpora) {
    if (arguments.contains(name)) await copyDir(directory, name);
  }
}
|
||
/// Copies the `.dart` files found under [fromDirectory] into
/// `out/<toDirectory>`, keeping each file's path relative to [fromDirectory].
///
/// When [_samplePercent] is not 100, `-<percent>` is appended to the output
/// directory name and each candidate file is kept only if a random draw in
/// `0..99` is below [_samplePercent].
///
/// Skipped entirely: anything that is not a regular file ending in `.dart`
/// (links are not followed), files whose relative path starts with an entry
/// of [_ignoreDirs], and files with any path component starting with `.`.
///
/// The returned future completes only after every selected file has been
/// copied; a failing copy surfaces as an error on that future.
Future<void> copyDir(String fromDirectory, String toDirectory) async {
  // If we're taking a random sample, put that in a separate directory.
  if (_samplePercent != 100) {
    toDirectory += '-$_samplePercent';
  }

  var copied = 0;
  var inDir = Directory(fromDirectory);

  // `await for` processes one entry at a time and waits for its copy to
  // finish. An async callback handed to `listen()` would not be awaited, so
  // the function could return while copies were still in flight.
  await for (var entry in inDir.list(recursive: true, followLinks: false)) {
    // With `followLinks: false` links are reported as `Link` objects, so this
    // single check rejects links and directories as well as non-Dart files.
    if (entry is! File || !entry.path.endsWith('.dart')) continue;

    var relative = p.relative(entry.path, from: inDir.path);

    // Skip redundant stuff.
    if (_ignoreDirs.any((ignore) => relative.startsWith(ignore))) continue;

    // NOTE(review): `generatedSuffixes` is declared in this file but is not
    // consulted here, so generated files are still copied - confirm intent.

    if (_random.nextInt(100) >= _samplePercent) continue;

    // If the path is in a subdirectory starting with '.', ignore it.
    if (p.split(relative).any((part) => part.startsWith('.'))) continue;

    var outPath = p.join('out', toDirectory, relative);

    // A recursive create is a no-op when the directory already exists.
    await Directory(p.dirname(outPath)).create(recursive: true);
    await entry.copy(outPath);

    // Print an occasional path as a progress indicator.
    copied++;
    if (copied % 100 == 0) print(relative);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import 'dart:io'; | ||
|
||
import 'package:corpus/utils.dart'; | ||
|
||
const _totalPackages = 2000; | ||
|
||
/// Downloads and extracts the latest version of packages listed by the pub
/// package index into `download/pub/<name>-<version>`.
///
/// Index pages are fetched one after another, following each page's
/// `next_url`, until [_totalPackages] packages have been queued or there is
/// no next page. Each package is handed to the [Downloader] as its own
/// resource; a failure for one package is logged and does not stop the rest.
void main(List<String> arguments) async {
  clean('download/pub');

  // Iterate through the pages (which are in most recent order) until we get
  // enough packages.
  var packagePage = 'http://pub.dartlang.org/api/packages';

  // Track the page number and the number of queued packages separately: the
  // page number is only for logging, the package count decides when to stop.
  var page = 1;
  var queued = 0;

  var downloader = Downloader(totalResources: _totalPackages);
  for (;;) {
    downloader.log('Getting index page $page...');
    var packages = await httpGetJson(packagePage);

    for (var package in packages['packages']) {
      downloader.withResource((logger) async {
        var name = package['name'] as String;
        var version = package['latest']['version'] as String;
        var archiveUrl = package['latest']['archive_url'] as String;

        try {
          logger.begin('Downloading $archiveUrl...');
          var archiveBytes = await httpGetBytes(archiveUrl);
          var tarFile = 'download/pub/$name-$version.tar.gz';
          await File(tarFile).writeAsBytes(archiveBytes);

          logger.log('Extracting $tarFile...');
          var outputDir = 'download/pub/$name-$version';
          await Directory(outputDir).create(recursive: true);
          var result =
              await Process.run('tar', ['-xf', tarFile, '-C', outputDir]);

          if (result.exitCode != 0) {
            // Keep the archive around so the failure can be inspected.
            logger.end('Could not extract $tarFile:\n${result.stderr}');
          } else {
            await File(tarFile).delete();
            logger.end('Finished $outputDir');
          }
        } catch (error) {
          // Best effort: report the failure for this package and carry on.
          logger.end('Error downloading $archiveUrl:\n$error');
        }
      });

      // Count after queueing so that exactly [_totalPackages] packages are
      // requested, matching the total given to the downloader.
      queued++;
      if (queued >= _totalPackages) return;
    }

    var nextUrl = packages['next_url'];
    if (nextUrl is! String) break;
    packagePage = nextUrl;
    page++;
  }
}
Oops, something went wrong.