From b8f4ca1ba29762d6ab506766238fdcd4de24540c Mon Sep 17 00:00:00 2001 From: "hui.sang" Date: Fri, 11 Nov 2022 12:09:14 +0800 Subject: [PATCH 1/2] dlrm model add checkpoint link #I60L07 dlrm model add checkpoint Signed-off-by: hui.sang --- .../ctr/dlrm/pytorch/dlrm/dist_model.py | 24 +++++++++++++++++++ .../ctr/dlrm/pytorch/scripts/train.py | 17 ++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py b/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py index 467fb9ba2..083257d8d 100644 --- a/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py +++ b/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py @@ -1,3 +1,20 @@ + +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + """Distributed version of DLRM model In order to code the hybrid decomposition, the model code needs to be restructured. 
I don't know a clean enough @@ -427,3 +444,10 @@ class DistDlrm(): def to(self, *args, **kwargs): self.bottom_model.to(*args, **kwargs) self.top_model.to(*args, **kwargs) + + def state_dict(self): + dlrm_state_dic = {} + dlrm_state_dic.update(self.bottom_model.state_dict()) + dlrm_state_dic.update(self.top_model.state_dict()) + + return dlrm_state_dic diff --git a/recommendation/ctr/dlrm/pytorch/scripts/train.py b/recommendation/ctr/dlrm/pytorch/scripts/train.py index 24eed49c7..3a57660fb 100644 --- a/recommendation/ctr/dlrm/pytorch/scripts/train.py +++ b/recommendation/ctr/dlrm/pytorch/scripts/train.py @@ -1,3 +1,18 @@ +# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + """Reference training script Only Criteo data is supported at the moment, one hot embedding. """ @@ -53,7 +68,7 @@ flags.DEFINE_enum("dataset_type", "memmap", ["bin", "memmap", "dist"], "Which da flags.DEFINE_boolean("use_embedding_ext", True, "Use embedding cuda extension. If False, use Pytorch embedding") # Saving and logging flags -flags.DEFINE_string("output_dir", "/tmp", "path where to save") +flags.DEFINE_string("output_dir", ".", "path where to save") flags.DEFINE_integer("test_freq", None, "#steps test. 
If None, 20 tests per epoch per MLperf rule.") flags.DEFINE_float("test_after", 0, "Don't test the model unless this many epochs has been completed") flags.DEFINE_integer("print_freq", None, "#steps per pring") -- Gitee From c7985e5d4cd9e9d644f3332b882bf999eeba0d19 Mon Sep 17 00:00:00 2001 From: "hui.sang" Date: Fri, 11 Nov 2022 13:37:00 +0800 Subject: [PATCH 2/2] dlrm model add checkpoint link #I60L07 dlrm model add checkpoint Signed-off-by: hui.sang --- recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py b/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py index 083257d8d..de79a510a 100644 --- a/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py +++ b/recommendation/ctr/dlrm/pytorch/dlrm/dist_model.py @@ -1,4 +1,3 @@ - # Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # -- Gitee