From 379d27e3a6656615a7342a8f2cbf01fce8b0375c Mon Sep 17 00:00:00 2001 From: majorli Date: Wed, 12 Apr 2023 07:04:15 +0000 Subject: [PATCH 1/2] update download url of DLRM model dataset link #I6UZGK The previous URL no longer works; updated by referring to: https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/ Signed-off-by: majorli --- .../ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh b/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh index 3ba96afdb..2806919f6 100644 --- a/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh +++ b/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh @@ -9,7 +9,7 @@ echo 'download files ...' if [ ! -f "day_0" ];then echo 'download day_0 ...' -curl -O https://storage.googleapis.com/criteo-cail-datasets/day_0.gz +curl -O https://sacriteopcail01.z16.web.core.windows.net/day_0.gz gzip -dk day_0.gz else echo "day_0 has already exist" @@ -17,7 +17,7 @@ fi if [ ! -f "day_1" ];then echo 'download day_1 ...' -curl -O https://storage.googleapis.com/criteo-cail-datasets/day_1.gz +curl -O https://sacriteopcail01.z16.web.core.windows.net/day_1.gz gzip -dk day_1.gz else echo "day_1 has already exist" @@ -25,7 +25,7 @@ fi if [ ! -f "day_2" ];then echo 'download day_2 ...' 
-curl -O https://storage.googleapis.com/criteo-cail-datasets/day_2.gz +curl -O https://sacriteopcail01.z16.web.core.windows.net/day_2.gz gzip -dk day_2.gz else echo "day_2 has already exist" -- Gitee From ad262dcd2ef7327890b8acfe25f094e9f8f5352f Mon Sep 17 00:00:00 2001 From: majorli Date: Thu, 20 Apr 2023 05:14:04 +0000 Subject: [PATCH 2/2] Bugfix: DLRM dataset download fails because the link is not available link #I6UZGK Signed-off-by: majorli --- recommendation/ctr/dlrm/pytorch/README.md | 16 +++++++++++++--- .../pytorch/dlrm/data/download_and_preprocess.sh | 6 +++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/recommendation/ctr/dlrm/pytorch/README.md b/recommendation/ctr/dlrm/pytorch/README.md index 0fe8df516..41d5345fb 100644 --- a/recommendation/ctr/dlrm/pytorch/README.md +++ b/recommendation/ctr/dlrm/pytorch/README.md @@ -15,13 +15,23 @@ pip3 install -r requirements.txt && python3 ./setup.py install Criteo_Terabyte consists of 23 days data, as it is very large, here only take 3 days data for an example. ```shell -# download data +# Check gzip version +gzip -V + +# If gzip version is not 1.6+, you need to install gzip 1.6 +wget https://ftp.gnu.org/gnu/gzip/gzip-1.6.tar.gz +tar -xzf gzip-1.6.tar.gz +cd gzip-1.6 +./configure && make install +cd ../ +rm -rf gzip-1.6.tar.gz gzip-1.6/ + +# Download data cd dlrm/data/ bash download_and_preprocess.sh ``` -After above steps, can get files: terabyte_processed_test.bin, terabyte_processed_train.bin, terabyte_processed_val.bin. - +After the above steps, you will get the files terabyte_processed_test.bin, terabyte_processed_train.bin, and terabyte_processed_val.bin in "/home/datasets/recommendation/Criteo_Terabyte/". 
## Step 3: Training diff --git a/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh b/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh index 2806919f6..ba742d6f4 100644 --- a/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh +++ b/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh @@ -9,7 +9,7 @@ echo 'download files ...' if [ ! -f "day_0" ];then echo 'download day_0 ...' -curl -O https://sacriteopcail01.z16.web.core.windows.net/day_0.gz +wget -c https://sacriteopcail01.z16.web.core.windows.net/day_0.gz gzip -dk day_0.gz else echo "day_0 has already exist" @@ -17,7 +17,7 @@ fi if [ ! -f "day_1" ];then echo 'download day_1 ...' -curl -O https://sacriteopcail01.z16.web.core.windows.net/day_1.gz +wget -c https://sacriteopcail01.z16.web.core.windows.net/day_1.gz gzip -dk day_1.gz else echo "day_1 has already exist" @@ -25,7 +25,7 @@ fi if [ ! -f "day_2" ];then echo 'download day_2 ...' -curl -O https://sacriteopcail01.z16.web.core.windows.net/day_2.gz +wget -c https://sacriteopcail01.z16.web.core.windows.net/day_2.gz gzip -dk day_2.gz else echo "day_2 has already exist" -- Gitee