|
|
@@ -53,12 +53,29 @@ DATADIR=data
|
|
|
mkdir -p "${RESULTDIR}"
|
|
|
mkdir -p "${DATADIR}"
|
|
|
|
|
|
-for i in {0..7}
|
|
|
+# Small datasets first: a single wget of the Google Drive URL is used for these.
|
|
|
+# The download, extraction and normalization steps are chained with && so that
|
|
|
+# a failure stops the chain. On failure the .train/.test outputs are removed;
|
|
|
+# otherwise an empty .train file would satisfy the "-f" guard below and the
|
|
|
+# dataset would never be retried on a later run.
|
|
|
+# DATASET, ID and normalize_text are defined outside this hunk; normalize_text
|
|
|
+# is fed the CSV on stdin (via redirection rather than a cat pipe).
|
|
|
+
|
|
|
+for i in {0..0}
|
|
|
+do
|
|
|
+  echo "Downloading dataset ${DATASET[i]}"
|
|
|
+  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
|
|
|
+  then
|
|
|
+    if ! {
|
|
|
+      wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz" &&
|
|
|
+        tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}" &&
|
|
|
+        normalize_text < "${DATADIR}/${DATASET[i]}_csv/train.csv" > "${DATADIR}/${DATASET[i]}.train" &&
|
|
|
+        normalize_text < "${DATADIR}/${DATASET[i]}_csv/test.csv" > "${DATADIR}/${DATASET[i]}.test"
|
|
|
+    }
|
|
|
+    then
|
|
|
+      # Report on stderr and drop partial outputs so the next run retries.
|
|
|
+      echo "Failed to prepare dataset ${DATASET[i]}" >&2
|
|
|
+      rm -f "${DATADIR}/${DATASET[i]}.train" "${DATADIR}/${DATASET[i]}.test"
|
|
|
+    fi
|
|
|
+  fi
|
|
|
+done
|
|
|
+
|
|
|
+# Large datasets require a bit more work: the first request returns an intermediate HTML page, and the actual download link has to be extracted from it
|
|
|
+
|
|
|
+for i in {1..7}
|
|
|
do
|
|
|
echo "Downloading dataset ${DATASET[i]}"
|
|
|
if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
|
|
|
then
|
|
|
- wget -c "https://googledrive.com/host/${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
|
|
|
+ # Step 1: request the file, store the cookies the server sets (-c) and save the returned HTML page.
|
|
|
+ curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
|
|
|
+ # Step 2: extract the href of the "uc-download-link" anchor from that page, turn the HTML-escaped "&amp;" back into "&",
|
|
|
+ # and fetch the resulting URL with the saved cookies (-b), following redirects (-L).
|
|
|
+ curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
|
|
|
tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
|
|
|
cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
|
|
|
cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
|