pytorch/caffe2/python/operator_test/segment_ops_test.py

from functools import partial
from hypothesis import given, settings
import numpy as np
import unittest
import hypothesis.strategies as st
from caffe2.python import core, workspace
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial
def sparse_lengths_sum_ref(D, I, L, normalize_by_lengths=False):
R = np.zeros(shape=(L.size,) + D.shape[1:], dtype=np.float32)
line = 0
for g in range(L.size):
for _ in range(L[g]):
if len(D.shape) > 1:
R[g, :] += D[I[line], :]
else:
R[g] += D[I[line]]
line += 1
if normalize_by_lengths and L[g] > 1:
if len(D.shape) > 1:
R[g, :] = R[g, :] / L[g]
else:
R[g] = R[g] / L[g]
return [R]
def sparse_lengths_mean_ref(D, I, L):
return sparse_lengths_sum_ref(D, I, L, normalize_by_lengths=True)
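

def _sparse_lengths_sum_ref_example():
    # A minimal sketch (illustrative values, not exercised by the test
    # suite) of the indices/lengths convention the references above use:
    # segment g reduces rows D[I[line]] over the next L[g] values of `line`.
    D = np.array([[1., 2.], [3., 4.], [5., 6.]], dtype=np.float32)
    I = np.array([2, 0, 1])  # rows of D to gather, in order
    L = np.array([2, 1])     # segment 0 takes two rows, segment 1 takes one
    (R,) = sparse_lengths_sum_ref(D, I, L)
    np.testing.assert_allclose(R, [[6., 8.], [3., 4.]])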


class TesterBase:
def segment_reduce_op(self, data, segment_ids, reducer, indices=None):
segments = self.split(data, segment_ids, indices)
output = np.zeros((len(segments), ) + data.shape[1:])
for i, segment in enumerate(segments):
if len(segment) > 0:
output[i] = reducer(segment)
else:
output[i] = 0.0
return output
def segment_reduce_grad_op(
self,
data,
segment_ids,
reducer_grad,
grad_out,
output,
indices=None
):
segments = self.split(data, segment_ids, indices)
segment_grads = [
reducer_grad(grad_out[i], [output[i]], [segment])
for i, segment in enumerate(segments)
]
return self.unsplit(data.shape[1:], segment_grads, segment_ids)
def _test(self, prefix, input_strategy, refs, gpu=False, **kwargs):
tester = self
operator_args = kwargs.pop('operator_args', {})
threshold = kwargs.pop('threshold', 1e-4)
grad_check = kwargs.pop('grad_check', True)
@given(X=input_strategy, **hu.gcs)
def test_segment_ops(self, X, gc, dc):
if not gpu and gc.device_type > 0:
return
for op_name, ref, grad_ref in refs:
inputs = ['input%d' % i for i in range(0, len(X))]
op = core.CreateOperator(
prefix + op_name, inputs, ['output'], **operator_args
)
                print('Operator %s, device type %s' % (op.type, gc.device_type))
def seg_reduce(data, *args):
indices, segments = (
args if len(args) == 2 else (None, args[0])
)
out = tester.segment_reduce_op(
data=data,
segment_ids=segments,
indices=indices,
reducer=ref
)
return (out, )
def seg_reduce_grad(grad_out, outputs, inputs):
data = inputs[0]
args = inputs[1:]
indices, segments = (
args if len(args) == 2 else (None, args[0])
)
                    # grad w.r.t. data
grad_val = tester.segment_reduce_grad_op(
data, segments, grad_ref, grad_out, outputs[0], indices
)
# if sparse, include indices along with data gradient
data_grad_slice = (
(grad_val, indices) if indices is not None else grad_val
)
# other inputs don't have gradient
return (data_grad_slice, ) + (None, ) * (len(inputs) - 1)
kwargs = {}
if grad_check:
kwargs['output_to_grad'] = 'output'
kwargs['grad_reference'] = seg_reduce_grad
self.assertReferenceChecks(
device_option=gc,
op=op,
inputs=X,
reference=seg_reduce,
threshold=threshold,
**kwargs
)
return test_segment_ops
class SegmentsTester(TesterBase):
def split(self, data, segment_ids, indices=None):
"""
Given:
data[M1 x M2 x ... x Md]
the input data
indices[N] the index of each entry of segment_ids into data,
where 0 <= index[i] < M1,
                          with default indices=[0,1,...,N-1]
segment_ids[N] the segment_id for each entry of indices,
returns K outputs, each one containing data entries corresponding
to one of the segments present in `segment_ids`.
"""
if segment_ids.size == 0:
return []
K = max(segment_ids) + 1
outputs = [
np.zeros(
(np.count_nonzero(segment_ids == seg_id), ) + data.shape[1:],
dtype=data.dtype
) for seg_id in range(0, K)
]
counts = np.zeros(K, dtype=int)
for i, seg_id in enumerate(segment_ids):
data_idx = i if indices is None else indices[i]
outputs[seg_id][counts[seg_id]] = data[data_idx]
counts[seg_id] += 1
return outputs
def unsplit(self, extra_shape, inputs, segment_ids):
""" Inverse operation to `split`, with indices=None """
output = np.zeros((len(segment_ids), ) + extra_shape)
if len(segment_ids) == 0:
return output
K = max(segment_ids) + 1
counts = np.zeros(K, dtype=int)
for i, seg_id in enumerate(segment_ids):
output[i] = inputs[seg_id][counts[seg_id]]
counts[seg_id] += 1
return output
class LengthsTester(TesterBase):
def split(self, data, lengths, indices=None):
K = len(lengths)
outputs = [
np.zeros((lengths[seg_id], ) + data.shape[1:],
dtype=data.dtype) for seg_id in range(0, K)
]
start = 0
for i in range(0, K):
for j in range(0, lengths[i]):
data_index = start + j
if indices is not None:
data_index = indices[data_index]
outputs[i][j] = data[data_index]
start += lengths[i]
return outputs
def unsplit(self, extra_shape, inputs, lengths):
N = sum(lengths)
output = np.zeros((N, ) + extra_shape)
K = len(lengths)
assert len(inputs) == K
current = 0
for i in range(0, K):
for j in range(0, lengths[i]):
output[current] = inputs[i][j]
current += 1
return output
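

def _lengths_split_round_trip_example():
    # Hypothetical sanity check (illustrative, not run by the suite):
    # with indices=None, `unsplit` is the inverse of `split`.
    data = np.arange(8, dtype=np.float32).reshape(4, 2)
    lengths = np.array([3, 0, 1])
    tester = LengthsTester()
    round_trip = tester.unsplit(
        data.shape[1:], tester.split(data, lengths), lengths
    )
    np.testing.assert_allclose(round_trip, data)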


def sum_grad(grad_out, outputs, inputs):
return np.repeat(
np.expand_dims(grad_out, axis=0),
inputs[0].shape[0],
axis=0
)


def logsumexp(x):
return np.log(np.sum(np.exp(x), axis=0))
def logsumexp_grad(grad_out, outputs, inputs):
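    # d(logsumexp)/dx_i = exp(x_i) / sum_j exp(x_j): broadcast grad_out
    # across the reduced axis and scale by the softmax weights.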
sum_exps = np.sum(np.exp(inputs[0]), axis=0)
return np.repeat(
np.expand_dims(grad_out / sum_exps, 0),
inputs[0].shape[0],
axis=0
) * np.exp(inputs[0])


def logmeanexp(x):
return np.log(np.mean(np.exp(x), axis=0))


def mean(x):
return np.mean(x, axis=0)
def mean_grad(grad_out, outputs, inputs):
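    # Each of the N stacked rows contributes 1/N to the mean, so the
    # gradient is grad_out / N broadcast along axis 0.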
return np.repeat(
np.expand_dims(grad_out / inputs[0].shape[0], 0),
inputs[0].shape[0],
axis=0
)


def max_fwd(x):
return np.amax(x, axis=0)
def max_grad(grad_out, outputs, inputs):
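    # inputs[0] is viewed as `blocks` stacked slices of `block_size`
    # elements each; for every within-slice position, each block whose
    # entry ties the reduced maximum receives the output gradient.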
flat_inputs = inputs[0].flatten()
flat_outputs = np.array(outputs[0]).flatten()
flat_grad_in = np.zeros(flat_inputs.shape)
flat_grad_out = np.array(grad_out).flatten()
blocks = inputs[0].shape[0]
if blocks == 0:
return np.zeros(inputs[0].shape)
block_size = flat_inputs.shape[0] // blocks
for i in range(block_size):
out_grad = flat_grad_out[i]
out = flat_outputs[i]
for j in range(blocks):
idx = j * block_size + i
            # multiple inputs may tie with the max; every tied entry
            # receives the gradient
if out == flat_inputs[idx]:
flat_grad_in[idx] = out_grad
return np.resize(flat_grad_in, inputs[0].shape)
REFERENCES_ALL = [
('Sum', partial(np.sum, axis=0), sum_grad),
('Mean', partial(np.mean, axis=0), mean_grad),
]
REFERENCES_SORTED = [
('RangeSum', partial(np.sum, axis=0), sum_grad),
('RangeLogSumExp', logsumexp, logsumexp_grad),
    # gradient matches logsumexp's, since logmeanexp = logsumexp - log(N)
('RangeLogMeanExp', logmeanexp, logsumexp_grad),
('RangeMean', mean, mean_grad),
('RangeMax', max_fwd, max_grad),
]
REFERENCES_LENGTHS_ONLY = [
('Max', partial(np.amax, axis=0), max_grad),
]
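
# `TesterBase._test` builds operator names as prefix + suffix, e.g.
# 'SortedSegment' + 'Sum' -> SortedSegmentSum and
# 'Lengths' + 'Max' -> LengthsMax.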


def sparse_lengths_weighted_sum_ref(D, W, I, L):
R = np.zeros(shape=(len(L), ) + D.shape[1:], dtype=D.dtype)
line = 0
for g in range(len(L)):
for _ in range(L[g]):
if len(D.shape) > 1:
R[g, :] += W[line] * D[I[line], :]
else:
R[g] += W[line] * D[I[line]]
line += 1
return [R]
def sparse_lengths_weighted_sum_grad_ref(
GO, fwd_out, fwd_in, grad_on_weights=False):
D, W, I, L = fwd_in
GI = np.zeros(shape=(len(I), ) + D.shape[1:], dtype=D.dtype)
GW = np.zeros(shape=W.shape, dtype=W.dtype) if grad_on_weights else None
line = 0
for g in range(len(L)):
for _ in range(L[g]):
if len(GO.shape) > 1:
GI[line, :] = W[line] * GO[g, :]
else:
GI[line] = W[line] * GO[g]
if GW is not None:
if len(GO.shape) > 1:
GW[line] = np.dot(GO[g].flatten(), D[I[line], :].flatten())
else:
GW[line] = np.dot(GO[g].flatten(), D[I[line]].flatten())
line += 1
return [(GI, I), GW, None, None]
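

def _sparse_lengths_weighted_sum_ref_example():
    # Illustrative check (made-up values, not part of the suite): with
    # L = [2], the single output row is W[0] * D[I[0]] + W[1] * D[I[1]].
    D = np.array([[1., 1.], [2., 2.]], dtype=np.float32)
    W = np.array([0.5, 2.0], dtype=np.float32)
    I = np.array([1, 0])
    L = np.array([2])
    (R,) = sparse_lengths_weighted_sum_ref(D, W, I, L)
    np.testing.assert_allclose(R, [[3., 3.]])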


class TestSegmentOps(hu.HypothesisTestCase):
def test_sorted_segment_ops(self):
SegmentsTester()._test(
'SortedSegment',
hu.segmented_tensor(
dtype=np.float32,
is_sorted=True,
allow_empty=True
),
REFERENCES_ALL + REFERENCES_SORTED
)(self)
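
    # Note: `_test` returns a hypothesis-decorated test function, which
    # each wrapper method here invokes immediately with `self`.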
def test_unsorted_segment_ops(self):
SegmentsTester()._test(
'UnsortedSegment',
hu.segmented_tensor(
dtype=np.float32,
is_sorted=False,
allow_empty=True
),
REFERENCES_ALL,
)(self)
def test_unsorted_segment_ops_gpu(self):
SegmentsTester()._test(
'UnsortedSegment',
hu.segmented_tensor(
dtype=np.float32,
is_sorted=False,
allow_empty=True,
),
REFERENCES_ALL,
gpu=workspace.has_gpu_support,
grad_check=False,
)(self)
def test_sparse_sorted_segment_ops(self):
SegmentsTester()._test(
'SparseSortedSegment',
hu.sparse_segmented_tensor(
dtype=np.float32,
is_sorted=True,
allow_empty=True
),
REFERENCES_ALL
)(self)
def test_sparse_unsorted_segment_ops(self):
SegmentsTester()._test(
'SparseUnsortedSegment',
hu.sparse_segmented_tensor(
dtype=np.float32,
is_sorted=False,
allow_empty=True
),
REFERENCES_ALL
)(self)
def test_lengths_ops(self):
LengthsTester()._test(
'Lengths',
hu.lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True
),
REFERENCES_ALL + REFERENCES_LENGTHS_ONLY,
)(self)
def test_sparse_lengths_ops(self):
for itype in [np.int32, np.int64]:
LengthsTester()._test(
'SparseLengths',
hu.sparse_lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True,
itype=itype,
),
REFERENCES_ALL,
)(self)
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
@given(**hu.gcs)
def test_unsorted_sums_large(self, gc, dc):
X = np.random.rand(10000, 32, 12).astype(np.float32)
segments = np.random.randint(0, 10000, size=10000).astype(np.int32)
op = core.CreateOperator("UnsortedSegmentSum", ["X", "segments"], "out")
self.assertDeviceChecks(dc, op, [X, segments], [0])
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
@given(**hu.gcs)
def test_sorted_segment_range_mean(self, gc, dc):
X = np.random.rand(6, 32, 12).astype(np.float32)
segments = np.array([0, 0, 1, 1, 2, 3]).astype(np.int32)
op = core.CreateOperator(
"SortedSegmentRangeMean",
["X", "segments"],
"out"
)
self.assertDeviceChecks(dc, op, [X, segments], [0])
self.assertGradientChecks(gc, op, [X, segments], 0, [0])
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
@given(**hu.gcs)
def test_sorted_segment_range_log_mean_exp(self, gc, dc):
X = np.random.rand(7, 32, 12).astype(np.float32)
segments = np.array([0, 0, 1, 1, 2, 2, 3]).astype(np.int32)
op = core.CreateOperator(
"SortedSegmentRangeLogMeanExp",
["X", "segments"],
"out"
)
self.assertDeviceChecks(dc, op, [X, segments], [0])
self.assertGradientChecks(gc, op, [X, segments], 0, [0])
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
@given(**hu.gcs)
def test_unsorted_means_large(self, gc, dc):
X = np.random.rand(10000, 31, 19).astype(np.float32)
segments = np.random.randint(0, 10000, size=10000).astype(np.int32)
op = core.CreateOperator("UnsortedSegmentMean", ["X", "segments"], "out")
self.assertDeviceChecks(dc, op, [X, segments], [0])
@serial.given(
inputs=hu.lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True,
),
**hu.gcs
)
def test_lengths_sum(self, inputs, gc, dc):
X, Y = inputs
op = core.CreateOperator("LengthsSum", ["X", "Y"], "out")
def ref(D, L):
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
line = 0
for g in range(L.size):
for _ in range(L[g]):
if len(D.shape) > 1:
R[g, :] += D[line, :]
else:
R[g] += D[line]
line += 1
return [R]
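
        # (This ref is sparse_lengths_sum_ref specialized to identity
        # indices, i.e. I = [0, 1, ..., len(D) - 1].)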
self.assertReferenceChecks(gc, op, [X, Y], ref)
self.assertDeviceChecks(dc, op, [X, Y], [0])
self.assertGradientChecks(gc, op, [X, Y], 0, [0])
@serial.given(
inputs=hu.sparse_lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True
),
**hu.gcs
)
def test_sparse_lengths_sum(self, inputs, gc, dc):
X, Y, Z = inputs
op = core.CreateOperator("SparseLengthsSum", ["X", "Y", "Z"], "out")
def ref(D, I, L):
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
line = 0
for g in range(L.size):
for _ in range(L[g]):
if len(D.shape) > 1:
R[g, :] += D[I[line], :]
else:
R[g] += D[I[line]]
line += 1
return [R]
self.assertReferenceChecks(gc, op, [X, Y, Z], ref)
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0])
@serial.given(
inputs=hu.lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True,
),
**hu.gcs
)
def test_lengths_mean(self, inputs, gc, dc):
X, Y = inputs
op = core.CreateOperator("LengthsMean", ["X", "Y"], "out")
def ref(D, L):
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
line = 0
for g in range(L.size):
for _ in range(L[g]):
if len(D.shape) > 1:
R[g, :] += D[line, :]
else:
R[g] += D[line]
line += 1
if L[g] > 1:
if len(D.shape) > 1:
R[g, :] = R[g, :] / L[g]
else:
R[g] = R[g] / L[g]
return [R]
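
        # (Same as the LengthsSum ref, with each segment additionally
        # divided by its length.)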
self.assertReferenceChecks(gc, op, [X, Y], ref)
self.assertDeviceChecks(dc, op, [X, Y], [0])
self.assertGradientChecks(gc, op, [X, Y], 0, [0])
@serial.given(
inputs=hu.sparse_lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True
),
**hu.gcs
)
def test_sparse_lengths_mean(self, inputs, gc, dc):
X, Y, Z = inputs
op = core.CreateOperator("SparseLengthsMean", ["X", "Y", "Z"], "out")
def ref(D, I, L):
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
line = 0
for g in range(L.size):
for _ in range(L[g]):
if len(D.shape) > 1:
R[g, :] += D[I[line], :]
else:
R[g] += D[I[line]]
line += 1
if L[g] > 1:
if len(D.shape) > 1:
R[g, :] = R[g, :] / L[g]
else:
R[g] = R[g] / L[g]
return [R]
self.assertReferenceChecks(gc, op, [X, Y, Z], ref)
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0])
@serial.given(
grad_on_weights=st.booleans(),
inputs=hu.sparse_lengths_tensor(
dtype=np.float32,
min_value=1,
max_value=5,
allow_empty=True
),
seed=st.integers(min_value=0, max_value=100),
**hu.gcs
)
def test_sparse_lengths_weighted_sum(
self, grad_on_weights, inputs, seed, gc, dc):
D, I, L = inputs
np.random.seed(int(seed))
W = np.random.rand(I.size).astype(np.float32)
op = core.CreateOperator(
"SparseLengthsWeightedSum",
["D", "W", "I", "L"],
"out",
grad_on_weights=grad_on_weights)
self.assertDeviceChecks(dc, op, [D, W, I, L], [0])
self.assertReferenceChecks(
device_option=gc,
op=op,
inputs=[D, W, I, L],
reference=sparse_lengths_weighted_sum_ref,
threshold=1e-4,
output_to_grad='out',
grad_reference=partial(
sparse_lengths_weighted_sum_grad_ref,
grad_on_weights=grad_on_weights),
)
self.assertGradientChecks(gc, op, [D, W, I, L], 0, [0])
if grad_on_weights:
self.assertGradientChecks(gc, op, [D, W, I, L], 1, [0])
@given(**hu.gcs)
def test_sparse_lengths_indices_in_gradient_sum_gpu(self, gc, dc):
X = np.random.rand(3, 3, 4, 5).astype(np.float32)
Y = np.asarray([3, 3, 2]).astype(np.int32)
Z = np.random.randint(0, 50, size=8).astype(np.int64)
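        # X is the upstream gradient (one slice per segment), Y the segment
        # lengths (summing to Z.size == 8), Z the gathered indices.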
op = core.CreateOperator(
"SparseLengthsIndicesInGradientSumGradient", ["X", "Y", "Z"], "out"
)
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
@given(**hu.gcs)
def test_sparse_lengths_indices_in_gradient_mean_gpu(self, gc, dc):
X = np.random.rand(3, 3, 4, 5).astype(np.float32)
Y = np.asarray([3, 3, 2]).astype(np.int32)
Z = np.random.randint(0, 50, size=8).astype(np.int64)
op = core.CreateOperator(
"SparseLengthsIndicesInGradientMeanGradient", ["X", "Y", "Z"], "out"
)
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
@given(**hu.gcs_cpu_only)
def test_legacy_sparse_and_lengths_sum_gradient(self, gc, dc):
X = np.random.rand(3, 64).astype(np.float32)
Y = np.asarray([20, 20, 10]).astype(np.int32)
workspace.FeedBlob("X", X)
workspace.FeedBlob("Y", Y)
test_net = core.Net("test_net")
test_net.SparseLengthsSumGradient(["X", "Y"], "out1")
test_net.LengthsSumGradient(["X", "Y"], "out2")
workspace.RunNetOnce(test_net)
out1 = workspace.FetchBlob("out1")
out2 = workspace.FetchBlob("out2")
self.assertTrue((out1 == out2).all())
@given(**hu.gcs)
@settings(deadline=10000)
def test_sparse_lengths_sum_invalid_index(self, gc, dc):
D = np.random.rand(50, 3, 4, 5).astype(np.float32)
I = (np.random.randint(0, 10000, size=10) + 10000).astype(np.int64)
L = np.asarray([4, 4, 2]).astype(np.int32)
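        # All indices are >= 10000 while D has only 50 rows, so the op must
        # reject them instead of reading out of bounds.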
op = core.CreateOperator(
"SparseLengthsSum",
["D", "I", "L"],
"out")
workspace.FeedBlob('D', D)
workspace.FeedBlob('I', I)
workspace.FeedBlob('L', L)
with self.assertRaises(RuntimeError):
workspace.RunOperatorOnce(op)
@serial.given(**hu.gcs_cpu_only)
def test_sparse_lengths_positional_weighted_sum(
self, gc, dc):
D = np.random.rand(50, 3, 4, 5).astype(np.float32)
W = np.random.rand(50).astype(np.float32)
indices = np.random.randint(0, 50, size=10).astype(np.int64)
L = np.asarray([4, 4, 2]).astype(np.int32)
op = core.CreateOperator(
"SparseLengthsPositionalWeightedSum",
["D", "W", "indices", "L"],
"out")
def ref_sparse(D, W, indices, L):
workspace.FeedBlob("L", L)
lengths_range_fill_op = core.CreateOperator(
"LengthsRangeFill", ["L"], ["L_pos_seq"])
workspace.RunOperatorOnce(lengths_range_fill_op)
workspace.FeedBlob("W", W)
gather_op = core.CreateOperator(
"Gather", ["W", "L_pos_seq"], ["W_gathered"])
workspace.RunOperatorOnce(gather_op)
workspace.FeedBlob("D", D)
workspace.FeedBlob("indices", indices)
sparse_op = core.CreateOperator(
"SparseLengthsWeightedSum",
["D", "W_gathered", "indices", "L"],
"out_ref")
workspace.RunOperatorOnce(sparse_op)
return (workspace.FetchBlob("out_ref"),)
self.assertReferenceChecks(
gc, op, [D, W, indices, L], ref_sparse)
@unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
@given(
input=hu.tensor(min_dim=2, max_dim=2, max_value=20, dtype=np.float16),
data_strategy=st.data(),
is_mean=st.booleans(),
**hu.gcs
)
@settings(deadline=None)
def test_sparse_lengths_fp16(self, input, data_strategy, is_mean, gc, dc):
m = input.shape[0]
lengths = data_strategy.draw(
hu.tensor(
max_dim=1,
max_value=input.shape[0],
dtype=np.int32,
elements=st.integers(min_value=0, max_value=27),
)
)
lengths_sum = int(np.sum(lengths).item())
indices = data_strategy.draw(
hu.arrays(
[lengths_sum], dtype=np.int64, elements=st.sampled_from(np.arange(m))
)
)
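        # One index per element (lengths sum to `lengths_sum`), each a valid
        # row of `input`.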
if is_mean:
op = core.CreateOperator(
"SparseLengthsMean", ["input", "indices", "lengths"], "out"
)
self.assertReferenceChecks(gc, op, [input, indices, lengths], sparse_lengths_mean_ref)
else:
op = core.CreateOperator(
"SparseLengthsSum", ["input", "indices", "lengths"], "out"
)
self.assertReferenceChecks(gc, op, [input, indices, lengths], sparse_lengths_sum_ref)
# @given(
# inputs=hu.lengths_tensor(
# dtype=np.float32,
# min_value=1,
# max_value=5,
# min_dim=1,
# max_dim=1,
# allow_empty=False,
# ),
# **hu.gcs
# )
# def test_lengths_max_gpu(self, inputs, gc, dc):
# def lengths_max_ref(I, L):
# R = np.zeros(shape=(len(L)), dtype=I.dtype)
# line = 0
# for g in range(len(L)):
# for i in range(L[g]):
# if i == 0:
# R[g] = I[line]
# else:
# R[g] = max(R[g], I[line])
# line += 1
# return [R]
# X, lengths = inputs
# op = core.CreateOperator("LengthsMax", ["X", "lengths"], "out")
# self.assertDeviceChecks(dc, op, [X, lengths], [0])
# self.assertReferenceChecks(
# device_option=gc,
# op=op,
# inputs=[X, lengths],
# reference=lengths_max_ref,
# threshold=1e-4,
# output_to_grad='out',
# )
if __name__ == "__main__":
import unittest
unittest.main()