-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup_cogs.sh
More file actions
38 lines (30 loc) · 1.08 KB
/
setup_cogs.sh
File metadata and controls
38 lines (30 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash -l
#
# Downloads the COGS dataset and prepares it under cogs/data.
#
# Steps:
#   1. Clone the COGS repository into cogs/data/lambda/original and delete
#      its train_100.tsv.
#   2. Write 1000 randomly chosen rows of gen.tsv to gen_dev.tsv.
#   3. Copy every file of the clone's data/ directory into cogs/data/lambda,
#      dropping rows that match "gardner" or "monastery" (case-insensitive).
#   4. Delete the clone, then run src/utils/parser.py and
#      src/utils/preprocess.py on cogs/data.
#
# All paths are relative to the current working directory.
#
# NOTE(review): the shebang's -l starts bash as a login shell; presumably this
# is needed to pick up the Python environment from the profile -- confirm.

# Abort on the first failing command or unset variable. 'pipefail' is left off
# on purpose: 'head' closes the pipe after 1000 lines, which can make
# 'sort' exit non-zero even though the sample was written correctly.
set -eu

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# download dataset
echo 'Setting up dataset under ./data.'
TARGET_DIR='cogs/data'
DATADIR="${TARGET_DIR}/lambda"
CLONE_DIR="${DATADIR}/original"

mkdir -p -- "${DATADIR}"
# A clone left behind by an earlier aborted run would make 'git clone' fail.
rm -rf -- "${CLONE_DIR:?}"
git clone https://github.com/najoungkim/COGS "${CLONE_DIR}" \
  || die "Failed to clone the COGS repository into ${CLONE_DIR}."
rm -rf -- "${CLONE_DIR}/data/train_100.tsv"

# create gen dev set
echo 'Building gen dev set for tuning...'
export GEN_DEV="${CLONE_DIR}/data/gen_dev.tsv"
GEN_FILE="${CLONE_DIR}/data/gen.tsv"
[[ -s "${GEN_FILE}" ]] || die "Missing or empty ${GEN_FILE}."
sort -R < "${GEN_FILE}" | head -n 1000 > "${GEN_DEV}"
wc -l "${GEN_DEV}"

# remove rows that contain primitives not in the train set
echo 'Removing rows that contain primitives not in the train set...'
for src_file in "${CLONE_DIR}"/data/*; do
  # Skip the literal pattern (no match) and anything that is not a regular file.
  [[ -f "${src_file}" ]] || continue
  echo 'Processing' "${src_file}"
  out_file="${DATADIR}/${src_file##*/}"
  # grep exits with 1 when it selects no lines, which is not an error here;
  # only a status above 1 signals a real failure.
  grep -i -v -E 'gardner|monastery' -- "${src_file}" > "${out_file}" || {
    grep_status=$?
    if (( grep_status > 1 )); then
      die "grep failed on ${src_file} (exit status ${grep_status})."
    fi
  }
done

# remove original files
rm -rf -- "${CLONE_DIR:?}"
echo 'Done filtering.'

# parse data as COGS graphs
echo 'Parsing COGS logical forms as graphs and building vocabularies...'
python src/utils/parser.py --data_dir "${TARGET_DIR}"
echo 'Done building COGS graphs.'

# split src and tgt sequences
python src/utils/preprocess.py --parent_path "${TARGET_DIR}"