Merge remote-tracking branch 'up/master'
rohithkrn committed Aug 13, 2019
2 parents 91110ca + f60ae9c commit 98e2af1
Showing 7 changed files with 344 additions and 242 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -249,6 +249,18 @@ Benchmarking
*NOTE: Benchmarking scripts accept extra arguments which will be passed along, such as --num-batches=100 to limit the number of data samples*
Model checkpoint saving/loading
-------------------------------------------------
During training, the model can be saved using --save-model=<path/model.pt>.
The model is saved only if there is an improvement in test accuracy (checked at --test-freq intervals).
A previously saved model can be loaded using --load-model=<path/model.pt>.
Once loaded, the model can be used to continue training, with the saved model serving as a checkpoint.
Alternatively, the saved model can be used to evaluate only on the test data set by specifying the --inference-only option.
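A minimal sketch of the save-on-improvement pattern described above, assuming a generic PyTorch model; the nn.Linear stand-in, the evaluate() placeholder, and the checkpoint dictionary layout are illustrative assumptions, not the actual DLRM implementation.

```python
import torch
import torch.nn as nn

model = nn.Linear(13, 1)   # hypothetical stand-in for the real model
test_freq = 1
best_acc = 0.0

def evaluate(m):
    # placeholder evaluation: returns a fake test accuracy in [0, 1)
    return float(torch.rand(1))

for epoch in range(3):
    # ... one training pass would run here ...
    if epoch % test_freq == 0:
        acc = evaluate(model)
        if acc > best_acc:  # save only when test accuracy improves
            best_acc = acc
            torch.save({"state_dict": model.state_dict(), "test_acc": acc},
                       "model.pt")

# resume training, or run inference only, from the saved checkpoint
checkpoint = torch.load("model.pt")
model.load_state_dict(checkpoint["state_dict"])
```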
Version
-------
0.1 : Initial release of the DLRM code
4 changes: 1 addition & 3 deletions data_utils.py
@@ -81,9 +81,7 @@ def processKaggleCriteoAdData(split, d_path):
break

# process data if not all files exist
if idx < split + 1:

# process data
if idx <= split:
for i in range(1, split + 1):
with np.load(str(d_path) + "kaggle_day_{0}.npz".format(i)) as data:

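The condition in this hunk appears to key off an index that counts how many preprocessed per-day files already exist, reprocessing the raw data when any are missing. As a hedged sketch of that idea (the all_days_processed helper and its exact loop are assumptions, not repository code):

```python
from pathlib import Path

def all_days_processed(d_path, split):
    # Hypothetical helper: True only when every per-day file
    # kaggle_day_<i>.npz for i = 1..split already exists.
    # The hunk above makes the same decision through an index variable,
    # reprocessing the raw data whenever idx <= split.
    idx = 1
    for i in range(1, split + 1):
        if not Path(str(d_path) + "kaggle_day_{0}.npz".format(i)).exists():
            break
        idx = i + 1
    return idx > split

print(all_days_processed("./", 7))  # False unless all 7 files are present
```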
182 changes: 93 additions & 89 deletions dlrm_data_caffe2.py
@@ -41,6 +41,7 @@ def read_dataset(
split=True,
raw_data="",
processed_data="",
inference_only=False,
):
# load
print("Loading %s dataset..." % dataset)
@@ -68,63 +69,66 @@ def read_dataset(
print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

# adjust parameters
lX = []
lS = []
lS_lengths = []
lS_indices = []
lT = []
train_nsamples = len(y_train)
data_size = train_nsamples
nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
print("Training data")
if num_batches != 0 and num_batches < nbatches:
print(
"Limiting to %d batches of the total % d batches" % (num_batches, nbatches)
)
nbatches = num_batches
else:
print("Total number of batches %d" % nbatches)

# training data main loop
for j in range(0, nbatches):
# number of data points in a batch
print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
n = min(mini_batch_size, data_size - (j * mini_batch_size))
# dense feature
idx_start = j * mini_batch_size
# WARNING: X_int_train is a PyTorch tensor
lX.append((X_int_train[idx_start : (idx_start + n)]).numpy().astype(np.float32))
# Training targets - outputs
# WARNING: y_train is a PyTorch tensor
lT.append(
(y_train[idx_start : idx_start + n])
.numpy()
.reshape(-1, 1)
.astype(np.int32)
)
# sparse feature (sparse indices)
lS_emb_indices = []
# for each embedding generate a list of n lookups,
# where each lookup is composed of multiple sparse indices
for size in range(n_emb):
lS_batch_indices = []
for _b in range(n):
# num of sparse indices to be used per embedding, e.g. for
# store lengths and indices
lS_batch_indices += (
(X_cat_train[idx_start + _b][size].view(-1))
.numpy()
.astype(np.int32)
).tolist()
lS_emb_indices.append(lS_batch_indices)
lS_indices.append(lS_emb_indices)
# Criteo Kaggle data it is 1 because data is categorical
lS_lengths.append([(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)])

lS = lS_indices.copy()
if not inference_only:
lX = []
lS_lengths = []
lS_indices = []
lT = []
train_nsamples = len(y_train)
data_size = train_nsamples
nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
print("Training data")
if num_batches != 0 and num_batches < nbatches:
print(
"Limiting to %d batches of the total % d batches"
% (num_batches, nbatches)
)
nbatches = num_batches
else:
print("Total number of batches %d" % nbatches)

# training data main loop
for j in range(0, nbatches):
# number of data points in a batch
print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
n = min(mini_batch_size, data_size - (j * mini_batch_size))
# dense feature
idx_start = j * mini_batch_size
# WARNING: X_int_train is a PyTorch tensor
lX.append(
(X_int_train[idx_start : (idx_start + n)]).numpy().astype(np.float32)
)
# Training targets - outputs
# WARNING: y_train is a PyTorch tensor
lT.append(
(y_train[idx_start : idx_start + n])
.numpy()
.reshape(-1, 1)
.astype(np.int32)
)
# sparse feature (sparse indices)
lS_emb_indices = []
# for each embedding generate a list of n lookups,
# where each lookup is composed of multiple sparse indices
for size in range(n_emb):
lS_batch_indices = []
for _b in range(n):
# num of sparse indices to be used per embedding, e.g. for
# store lengths and indices
lS_batch_indices += (
(X_cat_train[idx_start + _b][size].view(-1))
.numpy()
.astype(np.int32)
).tolist()
lS_emb_indices.append(lS_batch_indices)
lS_indices.append(lS_emb_indices)
# Criteo Kaggle data it is 1 because data is categorical
lS_lengths.append(
[(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
)
print("\n")

# adjust parameters
print("\n")
lX_test = []
lS_lengths_test = []
lS_indices_test = []
@@ -154,10 +158,7 @@ def read_dataset(
# Training targets - outputs
# WARNING: y_train is a PyTorch tensor
lT.append(
(y_test[idx_start : idx_start + n])
.numpy()
.reshape(-1, 1)
.astype(np.int32)
(y_test[idx_start : idx_start + n]).numpy().reshape(-1, 1).astype(np.int32)
)
# sparse feature (sparse indices)
lS_emb_indices = []
@@ -178,21 +179,36 @@ def read_dataset(
[(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
)

return (
nbatches,
lX,
lS,
lS_lengths,
lS_indices,
lT,
nbatches_test,
lX_test,
lS_lengths_test,
lS_indices_test,
lT_test,
ln_emb,
m_den,
)
if not inference_only:
return (
nbatches,
lX,
lS_lengths,
lS_indices,
lT,
nbatches_test,
lX_test,
lS_lengths_test,
lS_indices_test,
lT_test,
ln_emb,
m_den,
)
else:
return (
nbatches_test,
lX_test,
lS_lengths_test,
lS_indices_test,
lT_test,
None,
None,
None,
None,
None,
ln_emb,
m_den,
)


# uniform distribution (input data)
@@ -214,7 +230,6 @@ def generate_random_input_data(

# inputs and targets
lX = []
lS = []
lS_lengths = []
lS_indices = []
for j in range(0, nbatches):
@@ -224,13 +239,11 @@
Xt = ra.rand(n, m_den).astype(np.float32)
lX.append(Xt)
# sparse feature (sparse indices)
lS_emb = []
lS_emb_lengths = []
lS_emb_indices = []
# for each embedding generate a list of n lookups,
# where each lookup is composed of multiple sparse indices
for size in ln_emb:
lS_batch = []
lS_batch_lengths = []
lS_batch_indices = []
for _ in range(n):
@@ -249,17 +262,14 @@
# reset sparse_group_size in case some index duplicates were removed
sparse_group_size = np.int32(sparse_group.size)
# store lengths and indices
lS_batch.append(sparse_group.tolist())
lS_batch_lengths += [sparse_group_size]
lS_batch_indices += sparse_group.tolist()
lS_emb.append(lS_batch)
lS_emb_lengths.append(lS_batch_lengths)
lS_emb_indices.append(lS_batch_indices)
lS.append(lS_emb)
lS_lengths.append(lS_emb_lengths)
lS_indices.append(lS_emb_indices)

return (nbatches, lX, lS, lS_lengths, lS_indices)
return (nbatches, lX, lS_lengths, lS_indices)
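For reference, a toy illustration (not taken from the repository) of the lengths-plus-indices layout these loaders build for each embedding table: lS_lengths stores one lookup length per sample, and lS_indices stores the corresponding indices concatenated in sample order.

```python
import numpy as np

# one embedding table, three samples with 2, 1 and 3 sparse lookups each
batch = [[3, 7], [1], [7, 7, 2]]
lS_lengths = [np.array([len(s) for s in batch], dtype=np.int32)]
lS_indices = [np.array([i for s in batch for i in s], dtype=np.int32)]
print(lS_lengths[0])  # [2 1 3]
print(lS_indices[0])  # [3 7 1 7 7 2]
```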


# uniform distribution (output data)
@@ -307,7 +317,6 @@ def generate_synthetic_input_data(

# inputs and targets
lX = []
lS = []
lS_lengths = []
lS_indices = []
for j in range(0, nbatches):
Expand All @@ -317,13 +326,11 @@ def generate_synthetic_input_data(
Xt = ra.rand(n, m_den).astype(np.float32)
lX.append(Xt)
# sparse feature (sparse indices)
lS_emb = []
lS_emb_lengths = []
lS_emb_indices = []
# for each embedding generate a list of n lookups,
# where each lookup is composed of multiple sparse indices
for i, size in enumerate(ln_emb):
lS_batch = []
lS_batch_lengths = []
lS_batch_indices = []
for _ in range(n):
@@ -369,17 +376,14 @@ def generate_synthetic_input_data(
# reset sparse_group_size in case some index duplicates were removed
sparse_group_size = np.int32(sparse_group.size)
# store lengths and indices
lS_batch.append(sparse_group.tolist())
lS_batch_lengths += [sparse_group_size]
lS_batch_indices += sparse_group.tolist()
lS_emb.append(lS_batch)
lS_emb_lengths.append(lS_batch_lengths)
lS_emb_indices.append(lS_batch_indices)
lS.append(lS_emb)
lS_lengths.append(lS_emb_lengths)
lS_indices.append(lS_emb_indices)

return (nbatches, lX, lS, lS_lengths, lS_indices)
return (nbatches, lX, lS_lengths, lS_indices)


def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False):
@@ -467,7 +471,7 @@ def trace_profile(trace, enable_padding=False):
try: # found #
i = rstack.index(r)
# WARNING: I believe below is the correct depth in terms of meaning of the
# algorithm, but that's not what seems to be in the paper alg.
# algorithm, but that is not what seems to be in the paper alg.
# -1 can be subtracted if we defined the distance between
# consecutive accesses (e.g. r, r) as 0 rather than 1.
sd = l - i # - 1
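To make the depth convention concrete, here is a small self-contained sketch (a simplification under assumptions, not the repository's trace_profile) that computes stack distances for a toy trace with sd = l - i; an immediate repeat such as r, r gets distance 1, or 0 if the commented-out -1 were applied.

```python
def stack_distances(trace):
    # Sketch only: an LRU stack where, on a repeated reference r,
    # the distance is sd = l - i with l = len(rstack), as in the hunk above.
    rstack, out = [], []
    for r in trace:
        if r in rstack:
            i = rstack.index(r)
            out.append(len(rstack) - i)  # consecutive r, r yields 1 (0 with -1)
            rstack.pop(i)
        else:
            out.append(None)             # first access: infinite stack distance
        rstack.append(r)                 # r moves to the top of the stack
    return out

print(stack_distances(["a", "b", "a", "a"]))  # [None, None, 2, 1]
```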