Merge pull request #794 from esythan/gpups
gpups
frankwhzhang authored Jun 15, 2022
2 parents ba26394 + 2ccd243 commit b3417af
Showing 5 changed files with 27 additions and 8 deletions.
models/rank/dnn/config_gpubox.yaml (4 changes: 2 additions & 2 deletions)

@@ -27,7 +27,7 @@ runner:
model_save_path: "output_model_dnn_queue"

sync_mode: "gpubox"
-thread_num: 16
+thread_num: 30
reader_type: "InmemoryDataset" # DataLoader / QueueDataset / RecDataset / InmemoryDataset
pipe_command: "python3.7 models/rank/dnn/queuedataset_reader.py"
dataset_debug: False
@@ -49,7 +49,7 @@ hyper_parameters:
# user-defined <key, value> pairs
sparse_inputs_slots: 27
sparse_feature_number: 1024
-sparse_feature_dim: 11
+sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
distributed_embedding: 0
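Aside (not part of the diff): the drop from 11 to 9 here only adds up when read together with the other files in this commit. A minimal arithmetic sketch of how the numbers appear to relate, assuming the two extra embedding columns added in net.py below carry the show/click statistics consumed by continuous_value_model:

# Illustrative Python arithmetic only; not a Paddle API.
sparse_feature_dim = 9                            # config_gpubox.yaml, this hunk
embedding_table_width = sparse_feature_dim + 2    # net.py: sparse_embedding size, +2 for show/click
gpu_mf_size = sparse_feature_dim - 1              # static_gpubox_trainer.py: set_slot_dim_vector
print(embedding_table_width, gpu_mf_size)         # prints: 11 8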
models/rank/dnn/net.py (6 changes: 4 additions & 2 deletions; mode 100644 → 100755)

@@ -64,17 +64,19 @@ def __init__(self,
self.add_sublayer('act_%d' % i, act)
self._mlp_layers.append(act)

-def forward(self, sparse_inputs, dense_inputs):
+def forward(self, sparse_inputs, dense_inputs, show_click=None):

sparse_embs = []
for s_input in sparse_inputs:
if self.sync_mode == "gpubox":
emb = paddle.fluid.contrib.sparse_embedding(
input=s_input,
size=[
-self.sparse_feature_number, self.sparse_feature_dim
+self.sparse_feature_number, self.sparse_feature_dim + 2
],
param_attr=paddle.ParamAttr(name="embedding"))
+emb = paddle.fluid.layers.continuous_value_model(
+    emb, show_click, False)
else:
emb = self.embedding(s_input)
emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
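A rough NumPy sketch of what the gpubox branch appears to do per sparse slot (behaviour inferred from this hunk, not the Paddle kernel): the embedding table is widened by two columns that carry show/click statistics, and continuous_value_model with use_cvm=False strips those columns again, so the reshape above still produces sparse_feature_dim-wide vectors.

import numpy as np

def cvm_strip_show_click(emb):
    # emb: [batch, sparse_feature_dim + 2]; columns 0 and 1 hold show/click.
    # With use_cvm=False only the learned embedding columns are kept
    # (an assumption consistent with the reshape to sparse_feature_dim above).
    return emb[:, 2:]

batch, sparse_feature_dim = 4, 9
emb = np.random.rand(batch, sparse_feature_dim + 2).astype("float32")
assert cvm_strip_show_click(emb).shape == (batch, sparse_feature_dim)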
models/rank/dnn/static_model.py (17 changes: 15 additions & 2 deletions; mode 100644 → 100755)

@@ -51,9 +51,15 @@ def create_feeds(self, is_infer=False):
shape=[None, self.dense_input_dim],
dtype="float32")

+# sparse_input_ids = [
+# paddle.static.data(
+# name="C" + str(i), shape=[None, 1], dtype="int64")
+# for i in range(1, self.sparse_inputs_slots)
+# ]
+
sparse_input_ids = [
paddle.static.data(
-name="C" + str(i), shape=[None, 1], dtype="int64")
+name=str(i), shape=[None, 1], dtype="int64")
for i in range(1, self.sparse_inputs_slots)
]

@@ -77,8 +83,15 @@ def net(self, input, is_infer=False):
self.fc_sizes,
sync_mode=self.sync_mode)

+self.cast_label = paddle.cast(self.label_input, dtype='float32')
+ones = paddle.fluid.layers.fill_constant_batch_size_like(
+    input=self.label_input, shape=[-1, 1], dtype="float32", value=1)
+show_click = paddle.cast(
+    paddle.concat(
+        [ones, self.cast_label], axis=1), dtype='float32')
+show_click.stop_gradient = True
raw_predict_2d = dnn_model.forward(self.sparse_inputs,
-    self.dense_input)
+    self.dense_input, show_click)

predict_2d = paddle.nn.functional.softmax(raw_predict_2d)

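For reference, a minimal NumPy sketch of the show_click tensor built in this hunk (assuming every training example is one impression, so "show" is a column of ones and "click" is the 0/1 label):

import numpy as np

label = np.array([[0], [1], [1], [0]], dtype="int64")  # stand-in for self.label_input
ones = np.ones_like(label, dtype="float32")            # fill_constant_batch_size_like(...)
show_click = np.concatenate([ones, label.astype("float32")], axis=1)
print(show_click.shape)  # (4, 2): column 0 = show, column 1 = click
# stop_gradient = True above keeps these statistics out of backpropagation.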
tools/static_gpubox_trainer.py (6 changes: 4 additions & 2 deletions; mode 100644 → 100755)

@@ -123,7 +123,10 @@ def run_worker(self):
gpus_env = os.getenv("FLAGS_selected_gpus")
self.PSGPU = paddle.fluid.core.PSGPU()
gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
+gpu_mf_sizes = [self.model.sparse_feature_dim - 1] * (
+    self.model.sparse_inputs_slots - 1)
self.PSGPU.set_slot_vector(gpuslot)
+self.PSGPU.set_slot_dim_vector(gpu_mf_sizes)
self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
opt_info = paddle.fluid.default_main_program()._fleet_opt
if use_auc is True:
@@ -139,7 +142,6 @@ def run_worker(self):
if sync_mode == "heter":
self.heter_train_loop(epoch)
elif sync_mode == "gpubox":
-self.reader._set_use_ps_gpu(1)
self.dataset_train_loop(epoch)
elif reader_type == "QueueDataset":
self.dataset_train_loop(epoch)
@@ -171,6 +173,7 @@ def run_worker(self):
"Epoch: {}, using time {} second, ips {} {}/sec.".format(
epoch, epoch_time, epoch_speed, self.count_method))
self.train_result_dict["speed"].append(epoch_speed)
+self.PSGPU.end_pass()

model_dir = "{}/{}".format(save_model_path, epoch)
if fleet.is_first_worker(
@@ -181,7 +184,6 @@
self.inference_target_var)
fleet.barrier_worker()
self.reader.release_memory()
-self.PSGPU.end_pass()
logger.info("finish {} epoch training....".format(epoch))
self.PSGPU.finalize()

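Taken together, the trainer changes amount to the call order sketched below. This is a condensed, hypothetical outline rather than the trainer itself: model, gpus_env, epochs, dataset_train_loop and save_and_barrier stand in for the trainer's own members, and only PSGPU calls that appear in this diff are used.

import paddle

def gpubox_worker_outline(model, gpus_env, epochs, dataset_train_loop, save_and_barrier):
    psgpu = paddle.fluid.core.PSGPU()
    slots = [int(i) for i in range(1, model.sparse_inputs_slots)]
    psgpu.set_slot_vector(slots)
    # One mf size per slot; mirrors gpu_mf_sizes = sparse_feature_dim - 1 above.
    psgpu.set_slot_dim_vector([model.sparse_feature_dim - 1] * len(slots))
    psgpu.init_gpu_ps([int(s) for s in gpus_env.split(",")])
    for epoch in range(epochs):
        dataset_train_loop(epoch)  # one InMemoryDataset pass per epoch
        psgpu.end_pass()           # now called right after training, before the model is saved
        save_and_barrier(epoch)    # save_inference_model + barrier_worker + release_memory
    psgpu.finalize()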
tools/utils/static_ps/reader_helper.py (2 changes: 2 additions & 0 deletions; mode 100644 → 100755)

@@ -261,10 +261,12 @@ def __init__(self, input_var, file_list, config):
self.fs_name = self.config.get("runner.fs_name", "")
self.fs_ugi = self.config.get("runner.fs_ugi", "")
print("hdfs config:", self.fs_name, self.fs_ugi)
+self.use_gpu = self.config.get("runner.use_gpu", False)

def get_reader(self):
logger.info("Get InmemoryDataset")
dataset = paddle.distributed.InMemoryDataset()
+dataset._set_use_ps_gpu(self.use_gpu)
dataset.init(
use_var=self.input_var,
pipe_command=self.pipe_command,
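A minimal sketch of the new wiring (the config dict below is hypothetical; in the repo the value comes from the YAML loader): the runner.use_gpu flag read in __init__ is forwarded to the InMemoryDataset so it runs in ps-gpu mode under gpubox, replacing the trainer-side _set_use_ps_gpu call removed above.

import paddle

config = {"runner.use_gpu": True}  # e.g. what config_gpubox.yaml would provide
use_gpu = config.get("runner.use_gpu", False)

dataset = paddle.distributed.InMemoryDataset()
dataset._set_use_ps_gpu(use_gpu)   # set before dataset.init(...), as in the hunk above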
