-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessing_lingproc.mk
95 lines (78 loc) · 3.55 KB
/
processing_lingproc.mk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Debug trace marking the start of this include file.
# (log.debug is not defined in this file.)
$(call log.debug, COOKBOOK BEGIN INCLUDE: cookbook/processing_lingproc.mk)
###############################################################################
# LINGUISTIC PROCESSING TARGETS
# Targets for processing newspaper content with linguistic analysis
###############################################################################
# DOUBLE-COLON-TARGET: lingproc-target
# Hooks lingproc-target into the generic processing-target.
# Double-colon rules for the same target are independent of each other, so
# this line only contributes a prerequisite and does not clash with other
# `processing-target ::` rules.
processing-target :: lingproc-target
# USER-VARIABLE: LINGPROC_VALIDATE_OPTION
# Command-line option that enables JSON schema validation of the output.
# It is passed as-is to lib/spacy_linguistic_processing.py in the file rule.
#
# Defaults to --validate. Because `?=` only assigns when the variable is not
# yet defined, validation can be disabled by defining the variable with an
# empty value beforehand, for example:
#   LINGPROC_VALIDATE_OPTION ?= $(EMPTY)
LINGPROC_VALIDATE_OPTION ?= --validate
$(call log.debug, LINGPROC_VALIDATE_OPTION)
# USER-VARIABLE: LINGPROC_QUIET_OPTION
# Reserved for a quiet processing mode (@TODO: implement in the script).
# Empty by default; whatever it contains is appended to the command line of
# lib/spacy_linguistic_processing.py in the file rule.
LINGPROC_QUIET_OPTION ?=
$(call log.debug, LINGPROC_QUIET_OPTION)
# VARIABLE: LOCAL_REBUILT_STAMP_FILES
# All rebuilt stamp files currently present under $(LOCAL_PATH_REBUILT);
# they drive the dependency tracking of the linguistic processing outputs.
#
# Evaluated once at parse time (`:=`). The files are listed with `ls -r`
# (error output of a non-matching glob is discarded) and piped through
# $(NEWSPAPER_YEAR_SORTING) when that variable is non-empty, otherwise
# through `cat`, which leaves the `ls -r` order untouched.
LOCAL_REBUILT_STAMP_FILES := $(shell \
  ls -r $(LOCAL_PATH_REBUILT)/*.jsonl.bz2$(LOCAL_REBUILT_STAMP_SUFFIX) 2> /dev/null \
  | $(or $(NEWSPAPER_YEAR_SORTING),cat))
$(call log.debug, LOCAL_REBUILT_STAMP_FILES)
# FUNCTION: LocalRebuiltToLingprocFile
# Maps local rebuilt stamp file names to local linguistic processing file names.
#
# $(1): whitespace-separated list of paths of the form
#       $(LOCAL_PATH_REBUILT)/<stem>.jsonl.bz2$(LOCAL_REBUILT_STAMP_SUFFIX)
# Result: for every matching word, $(LOCAL_PATH_LINGPROC)/<stem>.jsonl.bz2;
#         words that do not match the pattern are passed through unchanged.
define LocalRebuiltToLingprocFile
$(patsubst $(LOCAL_PATH_REBUILT)/%.jsonl.bz2$(LOCAL_REBUILT_STAMP_SUFFIX),$(LOCAL_PATH_LINGPROC)/%.jsonl.bz2,$(1))
endef
# VARIABLE: LOCAL_LINGPROC_FILES
# List of linguistic processing output files, derived by mapping every entry of
# LOCAL_REBUILT_STAMP_FILES through LocalRebuiltToLingprocFile.
# Expanded once at parse time (`:=`); it is the prerequisite list of
# lingproc-target.
LOCAL_LINGPROC_FILES := \
$(call LocalRebuiltToLingprocFile,$(LOCAL_REBUILT_STAMP_FILES))
$(call log.debug, LOCAL_LINGPROC_FILES)
# TARGET: lingproc-target
#: Processes newspaper content with linguistic analysis
#
# Builds every file listed in LOCAL_LINGPROC_FILES.
# Only uses the data that is already available locally; it does not enforce
# any synchronization beforehand.
lingproc-target: $(LOCAL_LINGPROC_FILES)
# Registers the target as phony.
# NOTE(review): PHONY_TARGETS is presumably turned into .PHONY elsewhere - confirm.
PHONY_TARGETS += lingproc-target
# FILE-RULE: $(LOCAL_PATH_LINGPROC)/%.jsonl.bz2
#: Rule to process a single newspaper
#
# Prerequisites (same stem % for both):
#   $<            rebuilt stamp file under $(LOCAL_PATH_REBUILT)
#   $(word 2,$^)  language identification file under $(LOCAL_PATH_LANGIDENT)
#
# Recipe:
#   1. Creates the output directory $(@D).
#   2. Runs lib/spacy_linguistic_processing.py on the S3 counterparts of the
#      prerequisites (translated with LocalToS3), writing the result to $@ and
#      a log to $@.log.gz.
#   3. Dispatches on the exit code of the script:
#        0      success: uploads $@.log.gz to S3 via lib/s3_to_local_stamps.py
#        3      output already exists on S3: removes $@, no log upload, and
#               the recipe succeeds (exit 0)
#        other  error: removes $@ and fails the recipe with the same exit code
#
# Note: Unsets the errexit flag (set +e) so that the exit code of the script
# can be captured and inspected instead of aborting the shell immediately.
#
# NOTE(review): the suffix argument `.''` of the --s3-output-path LocalToS3
# call differs from the `''` used for --lid; LocalToS3 is not defined in this
# file, so the argument is kept unchanged - confirm that it is intended.
$(LOCAL_PATH_LINGPROC)/%.jsonl.bz2: $(LOCAL_PATH_REBUILT)/%.jsonl.bz2$(LOCAL_REBUILT_STAMP_SUFFIX) $(LOCAL_PATH_LANGIDENT)/%.jsonl.bz2
	$(MAKE_SILENCE_RECIPE) \
	mkdir -p $(@D) && \
	{ set +e ; \
	    python3 lib/spacy_linguistic_processing.py \
	      $(call LocalToS3,$<,$(LOCAL_REBUILT_STAMP_SUFFIX)) \
	      --lid $(call LocalToS3,$(word 2,$^),'') \
	      $(LINGPROC_VALIDATE_OPTION) \
	      --s3-output-path $(call LocalToS3,$@,.'') \
	      $(PROCESSING_KEEP_TIMESTAMP_ONLY_OPTION) \
	      $(PROCESSING_QUIT_IF_S3_OUTPUT_EXISTS_OPTION) \
	      $(PROCESSING_S3_OUTPUT_DRY_RUN) \
	      $(LINGPROC_QUIET_OPTION) \
	      --git-version $(GIT_VERSION) \
	      -o $@ \
	      --log-file $@.log.gz ; \
	    EXIT_CODE=$$? ; \
	    echo "Processing exit code: $$EXIT_CODE" ; \
	    if [ $$EXIT_CODE -eq 0 ] ; then \
	        echo "Processing completed successfully. Uploading logfile..." ; \
	        python3 lib/s3_to_local_stamps.py \
	          $(call LocalToS3,$@,.stamp).log.gz \
	          --upload-file $@.log.gz \
	          --force-overwrite ; \
	    elif [ $$EXIT_CODE -eq 3 ] ; then \
	        echo "Processing skipped (output exists on S3). Not uploading logfile." ; \
	        rm -f $@ ; \
	        exit 0 ; \
	    else \
	        echo "An error occurred during processing. Exit code: $$EXIT_CODE" ; \
	        rm -f $@ ; \
	        exit $$EXIT_CODE ; \
	    fi ; }
# Debug trace marking the end of this include file.
$(call log.debug, COOKBOOK END INCLUDE: cookbook/processing_lingproc.mk)