-
Notifications
You must be signed in to change notification settings - Fork 8
/
run_example.sh
executable file
·66 lines (50 loc) · 1.95 KB
/
run_example.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/bash
#
# End-to-end example of the data-processing pipeline for one dataset:
#   step1 data cleaning -> step2 near-dedup -> step3 exact-dedup
#   -> step4 second cleaning pass -> step5 line-count report.
#
# Every path below is relative, so run this script from the directory that
# contains code/, data/, cache/ and lm_resource/.
#
# Input : data/data_input/<ALIAS>.jsonl
# Output: data/data_output/final_output/<ALIAS>/data_clean.jsonl

# Stop at the first failing stage instead of feeding missing or stale files
# into the next one; also treat unset variables as errors.
set -euo pipefail
trap 'echo "run_example.sh: command failed at line ${LINENO}: ${BASH_COMMAND}" >&2' ERR

# Dataset alias: names the input file and the per-dataset output directories.
readonly ALIAS=sample
# Language code handed to the cleaning stages. Deliberately not called
# LANGUAGE: that name is GNU gettext's locale-priority variable, and assigning
# to it when it is already exported would change the message language of
# every child process started below.
readonly LANG_CODE=id

### step0:
### install package
# pip install -r requirements.txt
### download resource, including kenlm model and sentencepiece model
# mkdir lm_resource
# python data_cleaning/download_sentencepiece_kenlm_models.py --output_dir_path lm_resource
readonly LM_RESOURCE_DIR=lm_resource

# Working directories (one cache per stage, one output directory per stage).
readonly CLEAN_CACHE_DIR=cache/data_clean_cache
readonly NEAR_DEDUP_CACHE_DIR=cache/near_dedup_cache
readonly EXACT_DEDUP_CACHE_DIR=cache/exact_dedup_cache
readonly INPUT_DIR=data/data_input
readonly CLEANED_OUT_DIR=data/data_output/cleaned_data_output
readonly NEAR_DEDUP_OUT_DIR=data/data_output/near_dedup_output
readonly EXACT_DEDUP_OUT_DIR=data/data_output/exact_dedup_output
readonly FINAL_OUT_DIR=data/data_output/final_output

# Files handed from one stage to the next (and counted in step5).
readonly CLEANED_FILE="${CLEANED_OUT_DIR}/${ALIAS}/data_clean.jsonl"
readonly NEAR_DEDUP_FILE="${NEAR_DEDUP_OUT_DIR}/${ALIAS}/data_clean.jsonl"
readonly EXACT_DEDUP_FILE="${EXACT_DEDUP_OUT_DIR}/${ALIAS}/data_clean.jsonl"
readonly FINAL_FILE="${FINAL_OUT_DIR}/${ALIAS}/data_clean.jsonl"

mkdir -p -- \
  "$CLEAN_CACHE_DIR" \
  "$NEAR_DEDUP_CACHE_DIR" \
  "$EXACT_DEDUP_CACHE_DIR" \
  "$INPUT_DIR" \
  "$CLEANED_OUT_DIR" \
  "$NEAR_DEDUP_OUT_DIR" \
  "$EXACT_DEDUP_OUT_DIR" \
  "$FINAL_OUT_DIR"

#######################################
# Print the line count of one stage's output file.
# Arguments: $1 - label inserted into the message
#            $2 - path of the file to count
# Outputs:   "Counting lines in <label>: <count>" on stdout
# Returns:   0 always; an unreadable file yields an empty count (wc's own
#            error goes to stderr) so the remaining stats are still printed.
#######################################
report_line_count() {
  local label=$1
  local file=$2
  local count
  count=$(wc -l < "$file") || count=
  printf 'Counting lines in %s: %s\n' "$label" "$count"
}

### step1: data-cleaning
bash code/data_cleaning/run_example.sh \
  "$ALIAS" \
  "${INPUT_DIR}/${ALIAS}.jsonl" \
  "$LANG_CODE" \
  "$CLEANED_OUT_DIR" \
  "$LM_RESOURCE_DIR" \
  "$CLEAN_CACHE_DIR"

### step2: near-dedup
bash code/near_dedup/run_example.sh \
  "$CLEANED_FILE" \
  "${NEAR_DEDUP_OUT_DIR}/${ALIAS}" \
  "$NEAR_DEDUP_CACHE_DIR"

### step3: exact-dedup
# NOTE(review): the cache directory is passed twice, exactly as in the
# original invocation; confirm against code/exact_dedup/run_example.sh
# whether both positional parameters are meant to share one directory.
bash code/exact_dedup/run_example.sh \
  "$NEAR_DEDUP_FILE" \
  "${EXACT_DEDUP_OUT_DIR}/${ALIAS}" \
  "$ALIAS" \
  "$EXACT_DEDUP_CACHE_DIR" \
  "$EXACT_DEDUP_CACHE_DIR"

### step4: data-clean (second cleaning pass, run on the deduplicated data)
bash code/data_cleaning/run_example.sh \
  "$ALIAS" \
  "$EXACT_DEDUP_FILE" \
  "$LANG_CODE" \
  "$FINAL_OUT_DIR" \
  "$LM_RESOURCE_DIR" \
  "$CLEAN_CACHE_DIR"

### step5: output stats
report_line_count "cleaned data output" "$CLEANED_FILE"
report_line_count "near deduplication output" "$NEAR_DEDUP_FILE"
report_line_count "exact deduplication output" "$EXACT_DEDUP_FILE"
report_line_count "final output" "$FINAL_FILE"