Просмотр исходного кода

Fix google drive links in classification-results.sh - Issue 193

Summary: Recent changes in Google Drive require a different mechanism for retrieving the data: large files now go through an extra intermediate page before the download starts.

Reviewed By: ajoulin

Differential Revision: D4988508

fbshipit-source-id: 5ad51f450ed3b50de71622c72f4d609319aeb6cf
Christian Puhrsch 8 лет назад
Родитель
Коммит
2c4eabb61a
1 измененных файлов с 19 добавлено и 2 удалено
  1. 19 2
      classification-results.sh

+ 19 - 2
classification-results.sh

@@ -53,12 +53,29 @@ DATADIR=data
 mkdir -p "${RESULTDIR}"
 mkdir -p "${DATADIR}"
 
-for i in {0..7}
+# Small datasets first
+
+for i in {0..0}
+do
+  echo "Downloading dataset ${DATASET[i]}"
+  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
+  then
+    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
+    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
+    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
+    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
+  fi
+done
+
+# Large datasets require a bit more work due to the extra request page
+
+for i in {1..7}
 do
   echo "Downloading dataset ${DATASET[i]}"
   if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
   then
-    wget -c "https://googledrive.com/host/${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
+    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
+    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
     tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
     cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
     cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"