-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathMakefile
263 lines (220 loc) · 8.91 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#
# Experiment management for Tgen
#

# Environment settings
#
# All variables in this section use recursive (`=') assignment on purpose:
# several of them (QSUBMIT, DEBUG_LOG_*) refer to TRY_DIR, TRY_NUM, SEED,
# RUN_NAME and QHOLD, which are defined further down, per target, or on the
# command line of a sub-make, and therefore must be expanded at use time.
SHELL = /bin/bash
TGEN = ../run_tgen.py
MEM = 16g
CPUS = 8

# Extra qsubmit parameters: cluster queue by default, console engine if LOCAL is set
QP = --queue=cpu-troja.q
ifdef LOCAL
  QP = --engine=console
endif

# Job submission prefix; the command to run follows the trailing `--'
QSUBMIT = qsubmit --logdir '$(TRY_DIR)' --mem $(MEM) --cores $(CPUS) --jobname T.$(TRY_NUM)$(SEED).$(RUN_NAME) $(QP) $(QHOLD) --

# Command run at the start of every submitted job; the default prints nothing
ACTIVATE = echo -n

# DEBUG turns on debug logs for both training and generation,
# DEBUG_GEN for generation only
ifdef DEBUG
  DEBUG_LOG_TRAIN = -d $(TRY_DIR)/train.debug-log.txt.gz
  DEBUG_LOG_GEN = -d $(TRY_DIR)/gen.debug-log.txt
endif
ifdef DEBUG_GEN
  DEBUG_LOG_GEN = -d $(TRY_DIR)/gen.debug-log.txt
endif

# External tools ($$HOME is left for the shell to expand)
METRICS = $$HOME/e2e-metrics/measure_scores.py
SLOT_ERR = $$HOME/e2e-cleaning/slot_error.py
POSTPROCESS = ./postprocess/postprocess.py
AVERAGE_SCORES = ../util/average_scores.pl --e2e
GREP_SCORES = ../util/grep_scores.pl --bleu-range=50:70 --nist-range=6:9 --meteor-range=30:45 --rouge-range=55:70 --cider-range=1:2.5
DESC_EXP = ../util/describe_experiment.py
CFG_OVERRIDE_TOOL = ../util/cfg_override.py
NOWRAP = ../util/nowrap.pl

# The comments for the next two variables sit on their own lines: Make keeps
# whitespace that precedes an inline `#' as part of the variable's value.

# number of experiments to list in `make desc'
NUM_EXP = 20
# logfile pattern to use for score printing
# ($$file is the shell loop variable of the `desc' recipe)
LOGFILES = $$file/T.* $$file/{asearch_gen.,gen.,seq2seq_gen.,}log.txt
# name of the per-experiment file in which `desc' caches its output line
SCOREFILE = SCORE
# Runs directories
# NOTE: inline comments below are attached directly to the value (no space before
# `#') so that no trailing whitespace ends up in the variable.
RUNS_DIR := runs# main directory for experiment outputs
# TRY_NUM: the largest number N found in a `/N_' part of the paths matching
# $(RUNS_DIR)/*, plus one, zero-padded to three digits (001 if nothing matches).
TRY_NUM := $(shell perl -e '$$m=0; for(<$(RUNS_DIR)/*>){/\/(\d+)_/ and $$1 > $$m and $$m=$$1;} printf "%03d", $$m+1;')# experiment number
RUN_NAME := experiment# default name, to be overridden by targets
# Timestamp taken once, when this Makefile is parsed (each sub-make takes its own).
DATE := $(shell date +%Y-%m-%d_%H-%M-%S)
# Recursive (`=') so that a target-specific RUN_NAME is picked up at use time.
TRY_DIR = $(RUNS_DIR)/$(TRY_NUM)_$(DATE)_$(RUN_NAME)# experiment output directory
# Name of the data/ subdirectory holding the train/devel/test files.
TRAINING_SET := training-delex_v2
# Short command-line aliases for the longer variable names.
ifdef D # Shortcut D -> DESC
DESC := $(D)
endif
ifdef J # Shortcut J -> JOBS
JOBS := $(J)
endif
ifdef R # Shortcut R -> REFRESH
REFRESH := $(R)
endif
ifdef N # Shortcut N -> NUM_EXP
NUM_EXP := $(N)
endif
# If JOBS is set, build the extra arguments that the seq2seq_train recipe
# passes to $(TGEN) through $(PARALLEL).
ifdef JOBS
PARALLEL := -j $(JOBS) -e "T.$(TRY_NUM)"
endif
# Input data file defaults
# All paths live under data/$(TRAINING_SET)/.
SEQ2SEQ_CONFIG = config/config.yaml
# Value passed to `$(TGEN) seq2seq_train -s' and to `$(DESC_EXP) -p'.
TRAIN_PORTION = 1.0
TRAIN_DAS = data/$(TRAINING_SET)/train-das.txt
TRAIN_TEXTS = data/$(TRAINING_SET)/train-text.txt
TRAIN_CONCDAS = data/$(TRAINING_SET)/train-conc_das.txt
TRAIN_CONC = data/$(TRAINING_SET)/train-conc.txt
# By default the "test" files are the devel-* files.
TEST_DAS = data/$(TRAINING_SET)/devel-das.txt
TEST_TEXTS = data/$(TRAINING_SET)/devel-text.txt
TEST_CONCDAS = data/$(TRAINING_SET)/devel-conc_das.txt
TEST_CONC = data/$(TRAINING_SET)/devel-conc.txt
# With EVAL_TEST set, the test-* files are used for testing instead, and the
# devel-* files become the VALID_* files (VALID_* stay undefined otherwise).
ifdef EVAL_TEST
TEST_DAS = data/$(TRAINING_SET)/test-das.txt
TEST_TEXTS = data/$(TRAINING_SET)/test-text.txt
TEST_CONCDAS = data/$(TRAINING_SET)/test-conc_das.txt
TEST_CONC = data/$(TRAINING_SET)/test-conc.txt
VALID_CONCDAS = data/$(TRAINING_SET)/devel-conc_das.txt
VALID_CONC = data/$(TRAINING_SET)/devel-conc.txt
VALID_DAS = data/$(TRAINING_SET)/devel-das.txt
VALID_TEXTS = data/$(TRAINING_SET)/devel-text.txt
endif
# Help text
# The multi-line value is exported to the environment so that the `help' recipe
# can print it through the shell variable $$HELP_MESSAGE.
# Everything between `define' and `endef' is printed verbatim, so do not put
# comments inside.
# NOTE(review): the text below advertises QP='--console', while the LOCAL
# switch in this Makefile sets QP to `--engine=console' -- confirm which form
# qsubmit accepts.
define HELP_MESSAGE
Tgen experiments
================
make desc [NUM_EXP=20] [REFRESH=1]
* List experiments with descriptions and (possibly) scores.
* Change NUM_EXP if you need to see more than 20 latest results.
* Refresh scores to obtain new results by setting REFRESH=1 (makes it slower).
make seq2seq_cv_run J=[XX] D='Description'
* Run an experiment where seq2seq
training is directly followed by testing generation.
- set RANDS=1 to use 5 different random seeds (and average the results)
- set EVAL_TEST=1 to evaluate on test data instead of development data
make score-XXX
* Retrieve a score for run without RANDS.
make cv_score-XXX
* Retrieve a score for run with RANDS.
---
Use QP='--console' if you want the job to run interactively instead
of submitting to the cluster.
Use ACTIVATE='source path/to/virtualenv/bin/activate' to run
within a Virtualenv instead of the default Python.
Use DEBUG=1 to activate debugging outputs. Use DEBUG_GEN=1 to activate
debugging only for testing (in CV runs).
endef
export HELP_MESSAGE
#
# Targets
#
# Auxiliary targets
# Print the help text with the `make ...' lines highlighted.
# The empty alternative in the pattern matches every line, so nothing is
# filtered out; only lines that start with (optional whitespace and) `make'
# produce a non-empty match for --color to highlight.
# `grep -E' replaces the deprecated `egrep' alias.
.PHONY: help
help:
	@echo "$$HELP_MESSAGE" | grep -E --color '^(\s*make.*|)'
# List all experiments, find scores in logs (of various versions)
#
# For each of the last $(NUM_EXP) entries of $(RUNS_DIR) (in sorted order), print
# one line made of: the directory name (with the $(RUNS_DIR)/ prefix removed,
# the first three `_' turned into spaces, the century and the seconds dropped
# from the timestamp), the scores found by $(GREP_SCORES) in the log files, and
# the contents of the ABOUT file joined onto one line.
# The line is cached in <dir>/$(SCOREFILE); it is rebuilt when the cache is
# missing or when REFRESH is non-empty. Output is passed through $(NOWRAP)
# together with the terminal width.
#
# The loop variable must be called `file': $(LOGFILES) refers to $$file.
.PHONY: desc
desc:
	@COLS=`tput cols`; \
	ls -d $(RUNS_DIR)/* | sort | tail -n $(NUM_EXP) | while read -r file; do \
		score_file="$$file/$(SCOREFILE)"; \
		if [[ ! -f "$$score_file" || -n "$(REFRESH)" ]]; then \
			{ \
				echo -ne "$${file#$(RUNS_DIR)/}" ": " | sed 's/_/ /;s/_/ /;s/_/ /;s/ 20/ /;s/-[0-9][0-9] / /;'; \
				$(GREP_SCORES) $(LOGFILES); \
				echo -ne ' '; \
				tr '\n' ' ' < "$$file/ABOUT"; \
				echo; \
			} > "$$score_file"; \
		fi; \
		$(NOWRAP) $$COLS < "$$score_file"; \
	done
# Same listing as `desc', with parts of every line cut out by the sed script
# below to make it shorter.
# Uses $(MAKE) rather than a literal `make' so that the sub-make is the same
# binary and inherits the parent's flags and jobserver.
.PHONY: shortdesc
shortdesc:
	$(MAKE) desc NUM_EXP=$(NUM_EXP) | sed 's/\s\+.*F ..\...\s*/ /;s/\s\+.*cv_run :\s*/ /;s/E50.*+sort//;s/+cf_rnn.*128//;'
# Print every Make variable whose origin is not `environment...', `default' or
# `automatic', one per line, as NAME=expanded value (unexpanded definition).
# The output is produced by $(info) while the recipe is being expanded; the
# recipe itself expands to nothing but whitespace.
.PHONY: printvars
printvars:
	$(foreach V, $(sort $(.VARIABLES)), $(if $(filter-out environment% default automatic, $(origin $V)), $(info $V=$($V) ($(value $V)))))
# Print the state of the Git checkout: working-tree status, the last commit
# (abbreviated hash, author, relative date, subject) and the uncommitted diff,
# separated by `*** *** ***' lines.
.PHONY: printgit
printgit:
	@git status
	@echo -e "\n*** *** ***\n"
	@git log --pretty=format:"%h - %an, %ar : %s" -1
	@echo -e "\n*** *** ***\n"
	@git diff
# Create the experiment directory, print description and git version.
#
# Steps:
#   1. create $(TRY_DIR) and copy $(SEQ2SEQ_CONFIG) into it as config.yaml;
#   2. if CFG_OVERRIDE is non-empty, apply it to the copy with $(CFG_OVERRIDE_TOOL);
#   3. write ABOUT: the output of $(DESC_EXP) followed by $(DESC);
#   4. write VARS (output of `printvars') and GIT_VERSION (output of `printgit').
#
# TRY_NUM, DATE and RUN_NAME are passed to the `printvars' sub-make explicitly.
# Left alone, the sub-make would re-parse this Makefile and compute them anew
# (TRY_NUM after $(TRY_DIR) already exists, DATE at a later time, RUN_NAME
# without the target-specific value), so VARS would not describe this run.
.PHONY: prepare_dir
prepare_dir:
	mkdir -p $(TRY_DIR)
	cp $(SEQ2SEQ_CONFIG) $(TRY_DIR)/config.yaml
	if [[ -n '$(CFG_OVERRIDE)' ]]; then \
		$(CFG_OVERRIDE_TOOL) $(TRY_DIR)/config.yaml '$(CFG_OVERRIDE)' ; \
	fi;
	[[ -n "$(DEBUG)" || -n "$(DEBUG_GEN)" ]] && DEBUG_SETTING=-d ; \
	[[ -n "$(RANDS)" ]] && RANDS_SETTING="-r $(RANDS)"; \
	[[ -n "$(EVAL_TEST)" ]] && EVAL_SETTING=-e ; \
	$(DESC_EXP) -t "$(TRAINING_SET)" -p "$(TRAIN_PORTION)" $$EVAL_SETTING $$DEBUG_SETTING $$RANDS_SETTING $(TRY_DIR)/config.yaml > $(TRY_DIR)/ABOUT ; \
	echo "$(DESC)" >> $(TRY_DIR)/ABOUT
	$(MAKE) printvars TRY_NUM='$(TRY_NUM)' DATE='$(DATE)' RUN_NAME='$(RUN_NAME)' > $(TRY_DIR)/VARS
	$(MAKE) printgit > $(TRY_DIR)/GIT_VERSION
# Main targets

# `seq2seq_cv_run' is an alias for `cv_run'.
.PHONY: seq2seq_cv_run cv_run
seq2seq_cv_run: cv_run

# Training directly followed by generation on the test data.
#
# With RANDS set, the train+generate pair is run for the five seeds s0..s4,
# each in its own subdirectory $(TRY_DIR)/<seed>; otherwise it is run once
# with an empty seed, directly in $(TRY_DIR).
#
# For every seed:
#   * the training sub-make is run and its output appended to cv.log.txt;
#   * the job number is read from the last line of cv.log.txt that starts with
#     `Your job', `Submitted batch' or `Job_ID';
#   * the generation sub-make gets that number as QHOLD="--hold <number>",
#     which ends up on the qsubmit command line (presumably making the
#     generation job wait for the training job -- see qsubmit).
#
# $(MAKE) is used instead of a literal `make' so that the sub-makes are the
# same binary and inherit the parent's flags and jobserver.
cv_run: RUN_NAME := cv_run
cv_run: prepare_dir
cv_run:
	if [[ -n "$(RANDS)" ]]; then \
		SEEDS=(s0 s1 s2 s3 s4); \
	else \
		SEEDS=(""); \
	fi; \
	for SEED in "$${SEEDS[@]}"; do \
		TRAIN_RUN=seq2seq_train ; \
		TEST_RUN=seq2seq_gen_cv ; \
		$(MAKE) $$TRAIN_RUN TRY_DIR=$(TRY_DIR)/$$SEED SEED=$$SEED TRY_NUM=$(TRY_NUM) CFG_OVERRIDE='$(CFG_OVERRIDE)' 2>&1 | tee -a $(TRY_DIR)/cv.log.txt ; \
		JOB_NUM=`cat $(TRY_DIR)/cv.log.txt | grep '^\(Your job\|Submitted batch\|Job_ID\)' | tail -n 1 | sed 's/^\(Your job\|Submitted batch job\|Job_ID:\) \([0-9]\+\).*/\2/;' ` ; \
		$(MAKE) $$TEST_RUN TRY_DIR=$(TRY_DIR)/$$SEED SEED=$$SEED TRY_NUM=$(TRY_NUM) QP="$(QP)" QHOLD="--hold $$JOB_NUM"; \
	done
# `make cv_score-XXX': average the scores of run number XXX that was run with RANDS.
#
# The stem ($*) is the run number. `$**' below is the stem followed by a
# literal `*', i.e. the shell glob $(RUNS_DIR)/XXX*, which `ls -d' resolves to
# the run directory. The $(eval ...) lines set TRY_DIR and TRY_NUM while the
# recipe is being expanded, before the last line uses them.
cv_score-%:
	$(eval TRY_DIR := $(shell ls -d $(RUNS_DIR)/$**))
	$(eval TRY_NUM := $*)
	$(MAKE) cv_score TRY_DIR=$(TRY_DIR) TRY_NUM=$(TRY_NUM)

# Run $(AVERAGE_SCORES) over the generation job outputs found in the
# subdirectories of $(TRY_DIR); the result is shown and stored in
# $(TRY_DIR)/gen.log.txt.
.PHONY: cv_score
cv_score:
	$(AVERAGE_SCORES) $(TRY_DIR)/*/T.*.*_gen.o* | tee $(TRY_DIR)/gen.log.txt
# `make score-XXX': retrieve the score of run number XXX that was run without RANDS.
#
# The stem ($*) is the run number. `$**' below is the stem followed by a
# literal `*', i.e. the shell glob $(RUNS_DIR)/XXX*, which `ls -d' resolves to
# the run directory. The $(eval ...) lines set TRY_DIR and TRY_NUM while the
# recipe is being expanded, before the last line uses them.
score-%:
	$(eval TRY_DIR := $(shell ls -d $(RUNS_DIR)/$**))
	$(eval TRY_NUM := $*)
	$(MAKE) score TRY_DIR=$(TRY_DIR) TRY_NUM=$(TRY_NUM)

# Run $(AVERAGE_SCORES) over the generation job outputs found directly in
# $(TRY_DIR); the result is shown and stored in $(TRY_DIR)/gen.log.txt.
.PHONY: score
score:
	$(AVERAGE_SCORES) $(TRY_DIR)/T.*.*_gen.o* | tee $(TRY_DIR)/gen.log.txt
# Train a seq2seq model.
#
# Copy needed files (to ensure replication), then run the experiment:
#   * the training DAs, texts and conc_das files are copied into $(TRY_DIR);
#   * with EVAL_TEST set, the VALID_* files are copied as devel-* as well and
#     handed to the trainer through `-v "das,texts"';
#   * `$(TGEN) seq2seq_train' is submitted through $(QSUBMIT), with the model
#     written to $(TRY_DIR)/seq2seq.pickle.gz and the submission output copied
#     to $(TRY_DIR)/seq2seq_train.log.txt.
#
# The submitted command is one single-quoted string. The quote is closed and
# reopened around "$$VALID_DATA_SPEC" so that the submitting shell substitutes
# its value (empty without EVAL_TEST). Continuation lines inside the quotes are
# deliberately not indented further, to keep the submitted string unchanged.
.PHONY: seq2seq_train
seq2seq_train: RUN_NAME := seq2seq_train
seq2seq_train: prepare_dir
seq2seq_train:
	cp $(TRAIN_DAS) $(TRY_DIR)/train-das.txt
	cp $(TRAIN_TEXTS) $(TRY_DIR)/train-text.txt
	cp $(TRAIN_CONCDAS) $(TRY_DIR)/train-conc_das.txt
	if [[ -n "$(EVAL_TEST)" ]]; then \
		cp $(VALID_CONCDAS) $(TRY_DIR)/devel-conc_das.txt ; \
		cp $(VALID_DAS) $(TRY_DIR)/devel-das.txt ; \
		cp $(VALID_TEXTS) $(TRY_DIR)/devel-text.txt ; \
		VALID_DATA_SPEC="-v \"$(TRY_DIR)/devel-das.txt,$(TRY_DIR)/devel-text.txt\""; \
	fi; \
	$(QSUBMIT) '$(ACTIVATE); \
	$(TGEN) seq2seq_train $(DEBUG_LOG_TRAIN) \
	-s $(TRAIN_PORTION) $(PARALLEL) -r "$(SEED)" '"$$VALID_DATA_SPEC"' \
	$(TRY_DIR)/config.yaml $(TRY_DIR)/train-das.txt $(TRY_DIR)/train-text.txt \
	$(TRY_DIR)/seq2seq.pickle.gz' 2>&1 | tee $(TRY_DIR)/seq2seq_train.log.txt
# Generation step of a CV run: copy the test data into $(TRY_DIR), then submit
# the generation job for the model trained there.
# $(MAKE) is used instead of a literal `make' so that the sub-makes are the
# same binary and inherit the parent's flags and jobserver.
.PHONY: seq2seq_gen_cv
seq2seq_gen_cv:
	$(MAKE) prepare_test_data TRY_DIR=$(TRY_DIR)
	$(MAKE) seq2seq_gen_process TRY_DIR=$(TRY_DIR) SEED=$(SEED) TRY_NUM=$(TRY_NUM) QP="$(QP)" QHOLD="$(QHOLD)"
# Copy the four TEST_* input files into $(TRY_DIR) under fixed test-* names.
# Which files these are depends on EVAL_TEST (devel-* by default, test-* if set).
.PHONY: prepare_test_data
prepare_test_data:
	cp $(TEST_DAS) $(TRY_DIR)/test-das.txt
	cp $(TEST_TEXTS) $(TRY_DIR)/test-text.txt
	cp $(TEST_CONC) $(TRY_DIR)/test-conc.txt
	cp $(TEST_CONCDAS) $(TRY_DIR)/test-conc_das.txt
# Generate outputs with a trained model and evaluate them.
#
# `seq2seq_gen_process' sets RUN_NAME (which is also the $(TGEN) sub-command)
# and the model file, then runs the generic `gen_process' recipe.
#
# The submitted job runs, in this order:
#   1. $(TGEN) $(RUN_NAME): reads test-das.txt with model $(GEN_MODEL), writes
#      out-text.txt;
#   2. $(POSTPROCESS): out-text.txt -> out-detok.txt;
#   3. $(METRICS) on test-conc.txt and out-detok.txt;
#   4. $(SLOT_ERR) on test-conc_das.txt and out-detok.txt.
# The submission output is copied to $(TRY_DIR)/$(RUN_NAME).log.txt.
#
# GEN_CONFIG is not set anywhere in this section; it expands to nothing unless
# the caller provides it.
# Continuation lines inside the single-quoted command are deliberately not
# indented further, to keep the submitted string unchanged.
.PHONY: seq2seq_gen_process gen_process
seq2seq_gen_process: RUN_NAME := seq2seq_gen
seq2seq_gen_process: GEN_MODEL := $(TRY_DIR)/seq2seq.pickle.gz
seq2seq_gen_process: gen_process

gen_process:
	$(QSUBMIT) '$(ACTIVATE); \
	$(TGEN) $(RUN_NAME) -e $(TRY_DIR)/test-text.txt $(DEBUG_LOG_GEN) \
	-w $(TRY_DIR)/out-text.txt $(GEN_CONFIG) -a $(TRY_DIR)/test-conc_das.txt \
	$(GEN_MODEL) $(TRY_DIR)/test-das.txt; \
	$(POSTPROCESS) $(TRY_DIR)/out-text.txt $(TRY_DIR)/out-detok.txt; \
	$(METRICS) $(TRY_DIR)/test-conc.txt $(TRY_DIR)/out-detok.txt; \
	$(SLOT_ERR) --mrs $(TRY_DIR)/test-conc_das.txt $(TRY_DIR)/out-detok.txt' \
	2>&1 | tee $(TRY_DIR)/$(RUN_NAME).log.txt
# Rerun an experiment (TRY_DIR, TRY_NUM, RUN_NAME must be set!)
#
# Takes the most recently modified file matching $(TRY_DIR)/T.*.$(RUN_NAME).*
# and recovers the command from it: everything up to and including the first
# line starting with `== Directory' is dropped, as is everything from the line
# starting with `== Hard res' to the end, and the `== Command:' prefix is
# removed from what remains. The command is then resubmitted through
# $(QSUBMIT), with the submission output appended to
# $(TRY_DIR)/$(RUN_NAME).log.txt.
.PHONY: rerun
rerun:
	LAST_LOGFILE=`ls -t $(TRY_DIR)/T.*.$(RUN_NAME).* | head -n 1` ; \
	COMMAND=`cat $$LAST_LOGFILE | sed '1,/^== Directory/d;/^== Hard res/,$$d;s/^== Command:\s*//;'` ; \
	echo "Rerunning command: $$COMMAND"; \
	$(QSUBMIT) "$$COMMAND" 2>&1 | tee -a $(TRY_DIR)/$(RUN_NAME).log.txt ;