diff --git a/recommendation/ctr/dlrm/pytorch/README.md b/recommendation/ctr/dlrm/pytorch/README.md index 0fe8df516bb2759e00085d6153675e1e1f621399..41d5345fb6b7383c5f227f7705f4b11482a64fbd 100644 --- a/recommendation/ctr/dlrm/pytorch/README.md +++ b/recommendation/ctr/dlrm/pytorch/README.md @@ -15,13 +15,23 @@ pip3 install -r requirements.txt && python3 ./setup.py install Criteo_Terabyte consists of 23 days data, as it is very large, here only take 3 days data for an example. ```shell -# download data +# Check gzip version +gzip -V + +# If the gzip version is older than 1.6, install gzip 1.6 (download_and_preprocess.sh runs gzip -dk, and the -k option requires gzip 1.6+) +wget https://ftp.gnu.org/gnu/gzip/gzip-1.6.tar.gz +tar -xzf gzip-1.6.tar.gz +cd gzip-1.6 +./configure && make install +cd ../ +rm -rf gzip-1.6.tar.gz gzip-1.6/ + +# Download data cd dlrm/data/ bash download_and_preprocess.sh ``` -After above steps, can get files: terabyte_processed_test.bin, terabyte_processed_train.bin, terabyte_processed_val.bin. - +After the above steps, you will have the files terabyte_processed_test.bin, terabyte_processed_train.bin, and terabyte_processed_val.bin in "/home/datasets/recommendation/Criteo_Terabyte/". ## Step 3: Training diff --git a/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh b/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh index 3ba96afdb4c5321ba9b4cdb59ed792debf3fda10..ba742d6f4813d2ee8c25f12d467eb09732e2dc78 100644 --- a/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh +++ b/recommendation/ctr/dlrm/pytorch/dlrm/data/download_and_preprocess.sh @@ -9,7 +9,7 @@ echo 'download files ...' if [ ! -f "day_0" ];then echo 'download day_0 ...' -curl -O https://storage.googleapis.com/criteo-cail-datasets/day_0.gz +wget -c https://sacriteopcail01.z16.web.core.windows.net/day_0.gz gzip -dk day_0.gz else echo "day_0 has already exist" @@ -17,7 +17,7 @@ fi if [ ! -f "day_1" ];then echo 'download day_1 ...' 
-curl -O https://storage.googleapis.com/criteo-cail-datasets/day_1.gz +wget -c https://sacriteopcail01.z16.web.core.windows.net/day_1.gz gzip -dk day_1.gz else echo "day_1 has already exist" @@ -25,7 +25,7 @@ fi if [ ! -f "day_2" ];then echo 'download day_2 ...' -curl -O https://storage.googleapis.com/criteo-cail-datasets/day_2.gz +wget -c https://sacriteopcail01.z16.web.core.windows.net/day_2.gz gzip -dk day_2.gz else echo "day_2 has already exist"