2020-09-24 00:55:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2018-03-26 18:14:26 +00:00
|
|
|
|
2016-07-21 18:26:41 +00:00
|
|
|
from functools import partial
|
2020-08-08 19:10:52 +00:00
|
|
|
from hypothesis import given, settings
|
2016-10-07 20:08:53 +00:00
|
|
|
|
2016-07-21 18:26:41 +00:00
|
|
|
import numpy as np
|
2017-05-26 16:16:09 +00:00
|
|
|
import unittest
|
2018-03-26 18:14:26 +00:00
|
|
|
import hypothesis.strategies as st
|
|
|
|
|
|
|
|
|
|
from caffe2.python import core, workspace
|
|
|
|
|
import caffe2.python.hypothesis_test_util as hu
|
2018-09-19 16:58:15 +00:00
|
|
|
import caffe2.python.serialized_test.serialized_test_util as serial
|
2018-03-26 18:14:26 +00:00
|
|
|
|
2020-09-15 17:44:23 +00:00
|
|
|
def sparse_lengths_sum_ref(D, I, L, normalize_by_lengths=False):
    """Numpy reference for SparseLengthsSum / SparseLengthsMean.

    Gathers rows of D through index array I, then reduces consecutive runs
    whose lengths are given by L.  When `normalize_by_lengths` is set, each
    segment sum is divided by its length (i.e. a mean), matching the Mean
    variant of the operator.  Returns a single-element list as expected by
    assertReferenceChecks.
    """
    out = np.zeros(shape=(L.size,) + D.shape[1:], dtype=np.float32)
    cursor = 0
    for seg in range(L.size):
        count = L[seg]
        # D[idx] works uniformly for 1-D and N-D data.
        for idx in I[cursor:cursor + count]:
            out[seg] += D[idx]
        cursor += count
        # Dividing by 1 is a no-op, so lengths <= 1 are skipped.
        if normalize_by_lengths and count > 1:
            out[seg] = out[seg] / count
    return [out]
|
|
|
|
|
|
|
|
|
|
def sparse_lengths_mean_ref(D, I, L):
    """Numpy reference for SparseLengthsMean.

    Delegates to the sum reference with length-normalization enabled.
    """
    normalized = sparse_lengths_sum_ref(D, I, L, normalize_by_lengths=True)
    return normalized
|
|
|
|
|
|
2016-07-21 18:26:41 +00:00
|
|
|
|
2016-10-07 20:08:53 +00:00
|
|
|
class TesterBase:
    """Shared driver for testing *Segment*/*Lengths* reduction operators.

    Subclasses provide `split` (partition flat data into per-segment arrays)
    and `unsplit` (its inverse); this base builds numpy forward/backward
    references out of them and generates hypothesis-driven test bodies.
    """

    def segment_reduce_op(self, data, segment_ids, reducer, indices=None):
        """Forward reference: apply `reducer` to each segment of `data`."""
        segments = self.split(data, segment_ids, indices)
        output = np.zeros((len(segments), ) + data.shape[1:])
        for i, segment in enumerate(segments):
            # Empty segments reduce to zero without calling the reducer.
            if len(segment) > 0:
                output[i] = reducer(segment)
            else:
                output[i] = 0.0
        return output

    def segment_reduce_grad_op(
        self,
        data,          # forward input tensor
        segment_ids,   # per-entry segment ids (or per-segment lengths)
        reducer_grad,  # gradient function of the per-segment reducer
        grad_out,      # gradient w.r.t. the operator output
        output,        # forward output (needed by some reducers, e.g. max)
        indices=None   # optional gather indices for the sparse variants
    ):
        """Backward reference: per-segment reducer gradients, re-flattened
        back into the layout of `data` via `unsplit`."""
        segments = self.split(data, segment_ids, indices)
        segment_grads = [
            reducer_grad(grad_out[i], [output[i]], [segment])
            for i, segment in enumerate(segments)
        ]
        return self.unsplit(data.shape[1:], segment_grads, segment_ids)

    def _test(self, prefix, input_strategy, refs, gpu=False, **kwargs):
        """Build a hypothesis test function for a family of operators.

        `prefix` + each op name in `refs` names a Caffe2 operator; `refs`
        holds (op_name_suffix, forward_reducer, reducer_gradient) triples.
        Recognized kwargs: operator_args, threshold, grad_check.  Returns
        the generated function, intended to be called as `_test(...)(self)`.
        """
        tester = self
        operator_args = kwargs.pop('operator_args', {})
        threshold = kwargs.pop('threshold', 1e-4)
        grad_check = kwargs.pop('grad_check', True)

        @given(X=input_strategy, **hu.gcs)
        def test_segment_ops(self, X, gc, dc):
            # Skip non-CPU devices unless the caller opted into GPU testing.
            if not gpu and gc.device_type > 0:
                return
            for op_name, ref, grad_ref in refs:
                inputs = ['input%d' % i for i in range(0, len(X))]
                op = core.CreateOperator(
                    prefix + op_name, inputs, ['output'], **operator_args
                )
                print('Operator %s, ' % op.type, gc.device_type)

                def seg_reduce(data, *args):
                    # args is (indices, segments) for sparse ops and
                    # (segments,) otherwise.
                    indices, segments = (
                        args if len(args) == 2 else (None, args[0])
                    )
                    out = tester.segment_reduce_op(
                        data=data,
                        segment_ids=segments,
                        indices=indices,
                        reducer=ref
                    )
                    return (out, )

                def seg_reduce_grad(grad_out, outputs, inputs):
                    data = inputs[0]
                    args = inputs[1:]
                    indices, segments = (
                        args if len(args) == 2 else (None, args[0])
                    )
                    # grad r.t. data
                    grad_val = tester.segment_reduce_grad_op(
                        data, segments, grad_ref, grad_out, outputs[0], indices
                    )
                    # if sparse, include indices along with data gradient
                    data_grad_slice = (
                        (grad_val, indices) if indices is not None else grad_val
                    )
                    # other inputs don't have gradient
                    return (data_grad_slice, ) + (None, ) * (len(inputs) - 1)

                # NOTE: this rebinding creates a local `kwargs`, shadowing
                # the enclosing _test's **kwargs (which was consumed above).
                kwargs = {}
                if grad_check:
                    kwargs['output_to_grad'] = 'output'
                    kwargs['grad_reference'] = seg_reduce_grad
                self.assertReferenceChecks(
                    device_option=gc,
                    op=op,
                    inputs=X,
                    reference=seg_reduce,
                    threshold=threshold,
                    **kwargs
                )
        return test_segment_ops
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SegmentsTester(TesterBase):
    def split(self, data, segment_ids, indices=None):
        """Partition `data` rows into per-segment arrays.

        `segment_ids[i]` names the segment of entry i; `indices[i]` (when
        given) maps that entry to a row of `data`, defaulting to i itself.
        Returns one array per segment id in range [0, max(segment_ids)].
        """
        if segment_ids.size == 0:
            return []
        num_segments = max(segment_ids) + 1
        pieces = [
            np.zeros(
                (np.count_nonzero(segment_ids == sid), ) + data.shape[1:],
                dtype=data.dtype
            ) for sid in range(0, num_segments)
        ]
        fill = np.zeros(num_segments, dtype=int)
        for pos, sid in enumerate(segment_ids):
            src = pos if indices is None else indices[pos]
            pieces[sid][fill[sid]] = data[src]
            fill[sid] += 1
        return pieces

    def unsplit(self, extra_shape, inputs, segment_ids):
        """Inverse of `split` for the indices=None case: scatter the
        per-segment arrays back into one flat array ordered by entry."""
        merged = np.zeros((len(segment_ids), ) + extra_shape)
        if len(segment_ids) == 0:
            return merged
        fill = np.zeros(max(segment_ids) + 1, dtype=int)
        for pos, sid in enumerate(segment_ids):
            merged[pos] = inputs[sid][fill[sid]]
            fill[sid] += 1
        return merged
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LengthsTester(TesterBase):
    def split(self, data, lengths, indices=None):
        """Slice `data` into consecutive per-segment arrays whose sizes are
        given by `lengths`; `indices`, when given, redirects each position
        to a gathered row of `data`."""
        pieces = []
        offset = 0
        for seg_len in lengths:
            piece = np.zeros((seg_len, ) + data.shape[1:], dtype=data.dtype)
            for j in range(seg_len):
                src = offset + j
                if indices is not None:
                    src = indices[src]
                piece[j] = data[src]
            pieces.append(piece)
            offset += seg_len
        return pieces

    def unsplit(self, extra_shape, inputs, lengths):
        """Inverse of `split`: concatenate the per-segment arrays back into
        one flat array of sum(lengths) entries."""
        total = sum(lengths)
        merged = np.zeros((total, ) + extra_shape)
        assert len(inputs) == len(lengths)
        pos = 0
        for piece, seg_len in zip(inputs, lengths):
            for j in range(seg_len):
                merged[pos] = piece[j]
                pos += 1
        return merged
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def sum_grad(grad_out, outputs, inputs):
    """Gradient of a sum over axis 0: replicate grad_out for every row."""
    n_rows = inputs[0].shape[0]
    return np.repeat(grad_out[np.newaxis, ...], n_rows, axis=0)
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def logsumexp(x):
    """log(sum_i exp(x_i)) reduced along the first axis."""
    exps = np.exp(x)
    return np.log(exps.sum(axis=0))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def logsumexp_grad(grad_out, outputs, inputs):
    """Gradient of logsumexp over axis 0.

    d/dx_j log(sum_i exp(x_i)) = exp(x_j) / sum_i exp(x_i), scaled by
    the incoming grad_out.
    """
    x = inputs[0]
    exps = np.exp(x)
    scaled = grad_out / exps.sum(axis=0)
    tiled = np.repeat(scaled[np.newaxis, ...], x.shape[0], axis=0)
    return tiled * exps
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
|
2016-07-28 22:06:04 +00:00
|
|
|
def logmeanexp(x):
    """log of the mean of exp(x), reduced along the first axis."""
    mean_exp = np.mean(np.exp(x), axis=0)
    return np.log(mean_exp)
|
|
|
|
|
|
|
|
|
|
|
2016-07-21 18:26:41 +00:00
|
|
|
def mean(x):
    """Arithmetic mean over the first axis."""
    arr = np.asarray(x)
    return arr.mean(axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def mean_grad(grad_out, outputs, inputs):
    """Gradient of the mean over axis 0: grad_out / n for each of n rows."""
    n_rows = inputs[0].shape[0]
    per_row = np.expand_dims(grad_out / n_rows, 0)
    return np.repeat(per_row, n_rows, axis=0)
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
|
2016-10-07 20:08:53 +00:00
|
|
|
def max_fwd(x):
    """Element-wise maximum over the first axis."""
    return np.max(x, axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def max_grad(grad_out, outputs, inputs):
    """Gradient of max over axis 0.

    Routes the output gradient to *every* input entry that equals the
    segment maximum (ties all receive the gradient), zero elsewhere.
    """
    data = inputs[0]
    n_rows = data.shape[0]
    if n_rows == 0:
        return np.zeros(data.shape)
    flat_data = data.flatten()
    flat_max = np.array(outputs[0]).flatten()
    flat_g_out = np.array(grad_out).flatten()
    flat_g_in = np.zeros(flat_data.shape)
    row_len = flat_data.shape[0] // n_rows
    for col in range(row_len):
        max_val = flat_max[col]
        g = flat_g_out[col]
        for row in range(n_rows):
            pos = row * row_len + col
            # ties: each entry equal to the max gets the full gradient
            if flat_data[pos] == max_val:
                flat_g_in[pos] = g
    return np.resize(flat_g_in, data.shape)
|
|
|
|
|
|
|
|
|
|
|
2016-11-29 00:16:58 +00:00
|
|
|
# Reducers valid for every Segment/Lengths operator family.
# Each entry is (op-name suffix, forward reducer over axis 0, gradient fn).
REFERENCES_ALL = [
    ('Sum', partial(np.sum, axis=0), sum_grad),
    ('Mean', partial(np.mean, axis=0), mean_grad),
]
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
# Reducers only exposed by the SortedSegmentRange* operators.
# Each entry is (op-name suffix, forward reducer over axis 0, gradient fn).
REFERENCES_SORTED = [
    ('RangeSum', partial(np.sum, axis=0), sum_grad),
    ('RangeLogSumExp', logsumexp, logsumexp_grad),
    # gradient is the same as LogSumExp's: the 1/N inside the log only
    # shifts the output by a constant, so the derivative is unchanged
    ('RangeLogMeanExp', logmeanexp, logsumexp_grad),
    ('RangeMean', mean, mean_grad),
    ('RangeMax', max_fwd, max_grad),
]
|
|
|
|
|
|
2017-09-13 02:49:15 +00:00
|
|
|
# Reducers only exposed by the Lengths* operator family.
# Each entry is (op-name suffix, forward reducer over axis 0, gradient fn).
REFERENCES_LENGTHS_ONLY = [
    ('Max', partial(np.amax, axis=0), max_grad),
]
|
2016-07-21 18:26:41 +00:00
|
|
|
|
2018-03-26 18:14:26 +00:00
|
|
|
|
2017-08-22 22:19:03 +00:00
|
|
|
def sparse_lengths_weighted_sum_ref(D, W, I, L):
    """Numpy reference for SparseLengthsWeightedSum.

    Gathers rows of D through I, scales entry i by weight W[i], and sums
    consecutive runs whose lengths are given by L.  Returns a one-element
    list as expected by assertReferenceChecks.
    """
    out = np.zeros(shape=(len(L), ) + D.shape[1:], dtype=D.dtype)
    cursor = 0
    for seg in range(len(L)):
        # D[I[cursor]] works uniformly for 1-D and N-D data.
        for _ in range(L[seg]):
            out[seg] += W[cursor] * D[I[cursor]]
            cursor += 1
    return [out]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sparse_lengths_weighted_sum_grad_ref(
        GO, fwd_out, fwd_in, grad_on_weights=False):
    """Numpy gradient reference for SparseLengthsWeightedSum.

    Args:
        GO: gradient w.r.t. the forward output, one row per segment.
        fwd_out: forward output (unused; kept for the grad-reference API).
        fwd_in: forward inputs (D, W, I, L) — data, weights, gather
            indices, and segment lengths.
        grad_on_weights: also compute the gradient w.r.t. W.

    Returns:
        [(GI, I), GW, None, None]: a sparse (gradient, indices) pair for
        the data input, the weight gradient (or None), and None for the
        non-differentiable index/length inputs.
    """
    D, W, I, L = fwd_in
    GI = np.zeros(shape=(len(I), ) + D.shape[1:], dtype=D.dtype)
    GW = np.zeros(shape=W.shape, dtype=W.dtype) if grad_on_weights else None
    line = 0
    for g in range(len(L)):
        for _ in range(L[g]):
            # d(out[g])/d(D[I[line]]) = W[line]; works for 1-D and N-D GO.
            GI[line] = W[line] * GO[g]
            if GW is not None:
                # d(out[g])/d(W[line]) = <GO[g], D[I[line]]>
                GW[line] = np.dot(GO[g].flatten(), D[I[line]].flatten())
            line += 1
    # NOTE: a leftover debug `print(GW)` was removed here; it spammed
    # test logs on every gradient-reference evaluation.
    return [(GI, I), GW, None, None]
|
|
|
|
|
|
|
|
|
|
|
2016-07-21 18:26:41 +00:00
|
|
|
class TestSegmentOps(hu.HypothesisTestCase):
|
|
|
|
|
def test_sorted_segment_ops(self):
|
2017-03-15 01:12:12 +00:00
|
|
|
SegmentsTester()._test(
|
2016-07-21 18:26:41 +00:00
|
|
|
'SortedSegment',
|
2016-10-07 20:08:53 +00:00
|
|
|
hu.segmented_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
is_sorted=True,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
|
|
|
|
REFERENCES_ALL + REFERENCES_SORTED
|
|
|
|
|
)(self)
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
def test_unsorted_segment_ops(self):
|
2017-03-15 01:12:12 +00:00
|
|
|
SegmentsTester()._test(
|
2016-07-21 18:26:41 +00:00
|
|
|
'UnsortedSegment',
|
2016-10-07 20:08:53 +00:00
|
|
|
hu.segmented_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
is_sorted=False,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
2017-05-26 16:16:09 +00:00
|
|
|
REFERENCES_ALL,
|
|
|
|
|
)(self)
|
|
|
|
|
|
|
|
|
|
def test_unsorted_segment_ops_gpu(self):
|
|
|
|
|
SegmentsTester()._test(
|
|
|
|
|
'UnsortedSegment',
|
|
|
|
|
hu.segmented_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
is_sorted=False,
|
|
|
|
|
allow_empty=True,
|
|
|
|
|
),
|
|
|
|
|
REFERENCES_ALL,
|
2017-09-20 17:52:08 +00:00
|
|
|
gpu=workspace.has_gpu_support,
|
2017-05-26 16:16:09 +00:00
|
|
|
grad_check=False,
|
2016-10-07 20:08:53 +00:00
|
|
|
)(self)
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
def test_sparse_sorted_segment_ops(self):
|
2017-03-15 01:12:12 +00:00
|
|
|
SegmentsTester()._test(
|
2016-07-21 18:26:41 +00:00
|
|
|
'SparseSortedSegment',
|
2016-10-07 20:08:53 +00:00
|
|
|
hu.sparse_segmented_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
is_sorted=True,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
|
|
|
|
REFERENCES_ALL
|
|
|
|
|
)(self)
|
2016-07-21 18:26:41 +00:00
|
|
|
|
|
|
|
|
def test_sparse_unsorted_segment_ops(self):
|
2017-03-15 01:12:12 +00:00
|
|
|
SegmentsTester()._test(
|
2016-07-21 18:26:41 +00:00
|
|
|
'SparseUnsortedSegment',
|
2016-10-07 20:08:53 +00:00
|
|
|
hu.sparse_segmented_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
is_sorted=False,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
|
|
|
|
REFERENCES_ALL
|
|
|
|
|
)(self)
|
|
|
|
|
|
|
|
|
|
def test_lengths_ops(self):
|
2017-03-15 01:12:12 +00:00
|
|
|
LengthsTester()._test(
|
2016-10-07 20:08:53 +00:00
|
|
|
'Lengths',
|
|
|
|
|
hu.lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
2017-06-01 03:14:23 +00:00
|
|
|
max_value=5,
|
2016-10-07 20:08:53 +00:00
|
|
|
allow_empty=True
|
|
|
|
|
),
|
2018-03-26 18:14:26 +00:00
|
|
|
REFERENCES_ALL + REFERENCES_LENGTHS_ONLY,
|
2016-10-07 20:08:53 +00:00
|
|
|
)(self)
|
|
|
|
|
|
|
|
|
|
def test_sparse_lengths_ops(self):
|
2017-07-25 00:37:58 +00:00
|
|
|
for itype in [np.int32, np.int64]:
|
|
|
|
|
LengthsTester()._test(
|
|
|
|
|
'SparseLengths',
|
|
|
|
|
hu.sparse_lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
|
|
|
|
max_value=5,
|
|
|
|
|
allow_empty=True,
|
|
|
|
|
itype=itype,
|
|
|
|
|
),
|
2018-03-26 18:14:26 +00:00
|
|
|
REFERENCES_ALL,
|
2017-07-25 00:37:58 +00:00
|
|
|
)(self)
|
2016-11-23 02:31:47 +00:00
|
|
|
|
2017-05-26 16:16:09 +00:00
|
|
|
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
|
|
|
|
|
@given(**hu.gcs)
|
|
|
|
|
def test_unsorted_sums_large(self, gc, dc):
|
|
|
|
|
X = np.random.rand(10000, 32, 12).astype(np.float32)
|
|
|
|
|
segments = np.random.randint(0, 10000, size=10000).astype(np.int32)
|
|
|
|
|
op = core.CreateOperator("UnsortedSegmentSum", ["X", "segments"], "out")
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, segments], [0])
|
|
|
|
|
|
2017-12-13 07:16:57 +00:00
|
|
|
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
|
|
|
|
|
@given(**hu.gcs)
|
|
|
|
|
def test_sorted_segment_range_mean(self, gc, dc):
|
|
|
|
|
X = np.random.rand(6, 32, 12).astype(np.float32)
|
|
|
|
|
segments = np.array([0, 0, 1, 1, 2, 3]).astype(np.int32)
|
|
|
|
|
op = core.CreateOperator(
|
|
|
|
|
"SortedSegmentRangeMean",
|
|
|
|
|
["X", "segments"],
|
|
|
|
|
"out"
|
|
|
|
|
)
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, segments], [0])
|
2017-12-14 20:59:24 +00:00
|
|
|
self.assertGradientChecks(gc, op, [X, segments], 0, [0])
|
2017-12-13 07:16:57 +00:00
|
|
|
|
|
|
|
|
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
|
|
|
|
|
@given(**hu.gcs)
|
|
|
|
|
def test_sorted_segment_range_log_mean_exp(self, gc, dc):
|
|
|
|
|
X = np.random.rand(7, 32, 12).astype(np.float32)
|
|
|
|
|
segments = np.array([0, 0, 1, 1, 2, 2, 3]).astype(np.int32)
|
|
|
|
|
op = core.CreateOperator(
|
|
|
|
|
"SortedSegmentRangeLogMeanExp",
|
|
|
|
|
["X", "segments"],
|
|
|
|
|
"out"
|
|
|
|
|
)
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, segments], [0])
|
2017-12-14 20:59:24 +00:00
|
|
|
self.assertGradientChecks(gc, op, [X, segments], 0, [0])
|
2017-12-13 07:16:57 +00:00
|
|
|
|
2017-05-26 16:16:09 +00:00
|
|
|
@unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
|
|
|
|
|
@given(**hu.gcs)
|
|
|
|
|
def test_unsorted_means_large(self, gc, dc):
|
|
|
|
|
X = np.random.rand(10000, 31, 19).astype(np.float32)
|
|
|
|
|
segments = np.random.randint(0, 10000, size=10000).astype(np.int32)
|
|
|
|
|
op = core.CreateOperator("UnsortedSegmentMean", ["X", "segments"], "out")
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, segments], [0])
|
|
|
|
|
|
2018-09-19 16:58:15 +00:00
|
|
|
@serial.given(
|
2018-03-26 18:14:26 +00:00
|
|
|
inputs=hu.lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
|
|
|
|
max_value=5,
|
|
|
|
|
allow_empty=True,
|
|
|
|
|
),
|
|
|
|
|
**hu.gcs
|
|
|
|
|
)
|
|
|
|
|
def test_lengths_sum(self, inputs, gc, dc):
|
|
|
|
|
X, Y = inputs
|
2017-05-03 05:07:13 +00:00
|
|
|
op = core.CreateOperator("LengthsSum", ["X", "Y"], "out")
|
[Caffe2] Changes done inside Facebook (#6378)
* fix unit test for sqrt op
From the error logging:
[idx, grad, grad_estimate] are:
[[ 146. 0.5 0.45776367]
[ 147. 0.5 0.45776367]
The gradient == 0.5 is correct, which means the SqrtOp and its gradient is doing right job. (Because y = sqrt(x), loss = y^2/2 = x/2, and then d(loss)/dx = 1/2 = 0.5; )
The test failed because of numerical problem of grad_estimate (in unit test). It can be because the step_size is small, and float precision is not high (when there are multiple elements in the tensor, we do sum(y^2) to compute loss)
This diff
- increase the step size, and also move the test cases to be further away from 0 (where sqrt(x) is not well defined) to be safe :)
- also clean up, and merge the test case for inplace Vs. non-inplace
Tested with:
`CAFFE2_HYPOTHESIS_PROFILE=debug ai_bt caffe2/caffe2/python/operator_test:elementwise_ops_test -- "test_sqrt"`
* CompositeReader & CompositeReaderBuilder
A new type of reader gluing multiple readers together.
* Back out "Revert D7394363: [GanH]: Log D Trick for Cross Entropy with Sigmoid"
Original commit changeset: 9325a4356dbe
* [dai][WIP] convert params to int8 on ps before sending to trainer
Add float->uint8 conversion in addition to float->fp16 conversion in model_saver.
* [easy] improve unit test for sparse length sum ops
as desc.
#accept2ship
* Update GitHub upstream to 771fcb3455cbfe69c2abcc4cb3bd7ef92d59af24
* move sparse hash unique ops to OOS and add unit tests
- move the SparseHash version to OOS, since 'sparsehash' is already deps of caffe2 OOS: https://fburl.com/arssw4n1
- The 'SparseHash' engine is also being used in OOS, so the SparseHash version shall be in OOS to reduce confusion: https://fburl.com/o5ea7ah2
- fix the CUDA UniqueOp for the case when batch is empty.
- add unit test
* group_norm_op for caffe2
This is the cuda op for Group Normalization (GN): https://arxiv.org/abs/1803.08494
This code implements GN in one op that computes Y=gamma * (X-mu) / sigma + beta and also its gradients. It is expected to have minimal memory consumption (similar to the BN op), without creating new blobs if GN were implemented as several ops (e.g., reshape, norm_mean/std, affine_channel).
* Resubmit D7405233: disappeared in D7464958
OOS publish causes the op missing -- however, test was still there
* [c2] add sparse hash engine for cuda unique op
The SparseHash version of UniqueOp copy input tensor to CPU, and make use of sparse hash map to get unique output, and then copy back to GPU.
* [dper][gpu] enable unit testing gpu trainer for sparse nn
to debug the GPU trainer using mock data in unit test.
make it easier to develop GPU trainer for new models.
* Reuse Gloo context for Synchronize() calls
Previously we were creating (and leaking) the Gloo context on each call to Synchronize(). Now only run the common world op and create the barrier net once, then run the barrier net on each Synchronize() call. Since timeout is associated with the Gloo context, assert that the timeout is fixed instead of trying to handle the complexity of multiple timeouts (and associated contexts).
* [GanH/WGAN][1/n]: add FC param clipping
as titled
* [mobile] minimizing changes between caffe2_benchmark and speed_benchmark
* [GanH]: enable diagnose within model
avoid finding blob names but to directly enable inside the model
* Add `net_transformer_fun` option to DPM
This callback allows for various transformations to be made to the
model after gradient operators have been added. The immediate motivation for
this is to allow transformations such has "checkpoint-and-recompute" which
allow trading off memory for additional compute.
Adding several callbacks like this has made DPM's API less than ideal at this
stage. However, I could not find any reasonable alternative.
* [DT] [33/n] Compile flow task groups
task groups need to compiled in order to pickle the object in fblearner. However I also changed the Job's compile function as creating new object is not necessary.
* Initial commit for sparse_normalize vectorization and benchmark
* [GanH]: LB Calibration for JSD
as titled
* Tracing event in async executor
Adding event tracing through TRACE_EVENT macro in async executor
* [Resubmit] D7409751 Reseting book-keeping blobs when the reservoir is reset
D7409751 got lost in D7464958
* Visualizing realtime weights values
we want to visualize the weights values as optimizer is iterating. This diff supports to visual the weights at an assigned index.
Currently, we assume the blob to be 2 dimensional.
* [GanH][Easy]: Fix Homotopy Weighting
apparantely, there was a bug in homotopy weight (alpha, beta) update
* [c2] move sparse hash unique op out of oss
so that oss do not need to depend on google hash map.
* Get rid of std::round as it's not supported on Android
* Revert changes on setup.py
* Skip shaky test on Dataio
* fix
2018-04-11 04:11:43 +00:00
|
|
|
|
|
|
|
|
def ref(D, L):
|
|
|
|
|
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
|
|
|
|
|
line = 0
|
|
|
|
|
for g in range(L.size):
|
|
|
|
|
for _ in range(L[g]):
|
|
|
|
|
if len(D.shape) > 1:
|
|
|
|
|
R[g, :] += D[line, :]
|
|
|
|
|
else:
|
|
|
|
|
R[g] += D[line]
|
|
|
|
|
line += 1
|
|
|
|
|
return [R]
|
|
|
|
|
|
|
|
|
|
self.assertReferenceChecks(gc, op, [X, Y], ref)
|
2017-05-03 05:07:13 +00:00
|
|
|
self.assertDeviceChecks(dc, op, [X, Y], [0])
|
|
|
|
|
self.assertGradientChecks(gc, op, [X, Y], 0, [0])
|
|
|
|
|
|
2018-09-19 16:58:15 +00:00
|
|
|
@serial.given(
|
2018-03-26 18:14:26 +00:00
|
|
|
inputs=hu.sparse_lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
|
|
|
|
max_value=5,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
|
|
|
|
**hu.gcs
|
|
|
|
|
)
|
|
|
|
|
def test_sparse_lengths_sum(self, inputs, gc, dc):
|
|
|
|
|
X, Y, Z = inputs
|
2017-05-03 05:07:13 +00:00
|
|
|
op = core.CreateOperator("SparseLengthsSum", ["X", "Y", "Z"], "out")
|
[Caffe2] Changes done inside Facebook (#6378)
* fix unit test for sqrt op
From the error logging:
[idx, grad, grad_estimate] are:
[[ 146. 0.5 0.45776367]
[ 147. 0.5 0.45776367]
The gradient == 0.5 is correct, which means the SqrtOp and its gradient is doing right job. (Because y = sqrt(x), loss = y^2/2 = x/2, and then d(loss)/dx = 1/2 = 0.5; )
The test failed because of numerical problem of grad_estimate (in unit test). It can be because the step_size is small, and float precision is not high (when there are multiple elements in the tensor, we do sum(y^2) to compute loss)
This diff
- increase the step size, and also move the test cases to be further away from 0 (where sqrt(x) is not well defined) to be safe :)
- also clean up, and merge the test case for inplace Vs. non-inplace
Tested with:
`CAFFE2_HYPOTHESIS_PROFILE=debug ai_bt caffe2/caffe2/python/operator_test:elementwise_ops_test -- "test_sqrt"`
* CompositeReader & CompositeReaderBuilder
A new type of reader gluing multiple readers together.
* Back out "Revert D7394363: [GanH]: Log D Trick for Cross Entropy with Sigmoid"
Original commit changeset: 9325a4356dbe
* [dai][WIP] convert params to int8 on ps before sending to trainer
Add float->uint8 conversion in addition to float->fp16 conversion in model_saver.
* [easy] improve unit test for sparse length sum ops
as desc.
#accept2ship
* Update GitHub upstream to 771fcb3455cbfe69c2abcc4cb3bd7ef92d59af24
* move sparse hash unique ops to OOS and add unit tests
- move the SparseHash version to OOS, since 'sparsehash' is already deps of caffe2 OOS: https://fburl.com/arssw4n1
- The 'SparseHash' engine is also being used in OOS, so the SparseHash version shall be in OOS to reduce confusion: https://fburl.com/o5ea7ah2
- fix the CUDA UniqueOp for the case when batch is empty.
- add unit test
* group_norm_op for caffe2
This is the cuda op for Group Normalization (GN): https://arxiv.org/abs/1803.08494
This code implements GN in one op that computes Y=gamma * (X-mu) / sigma + beta and also its gradients. It is expected to have minimal memory consumption (similar to the BN op), without creating new blobs if GN were implemented as several ops (e.g., reshape, norm_mean/std, affine_channel).
* Resubmit D7405233: disappeared in D7464958
OOS publish causes the op missing -- however, test was still there
* [c2] add sparse hash engine for cuda unique op
The SparseHash version of UniqueOp copy input tensor to CPU, and make use of sparse hash map to get unique output, and then copy back to GPU.
* [dper][gpu] enable unit testing gpu trainer for sparse nn
to debug the GPU trainer using mock data in unit test.
make it easier to develop GPU trainer for new models.
* Reuse Gloo context for Synchronize() calls
Previously we were creating (and leaking) the Gloo context on each call to Synchronize(). Now only run the common world op and create the barrier net once, then run the barrier net on each Synchronize() call. Since timeout is associated with the Gloo context, assert that the timeout is fixed instead of trying to handle the complexity of multiple timeouts (and associated contexts).
* [GanH/WGAN][1/n]: add FC param clipping
as titled
* [mobile] minimizing changes between caffe2_benchmark and speed_benchmark
* [GanH]: enable diagnose within model
avoid finding blob names but to directly enable inside the model
* Add `net_transformer_fun` option to DPM
This callback allows for various transformations to be made to the
model after gradient operators have been added. The immediate motivation for
this is to allow transformations such has "checkpoint-and-recompute" which
allow trading off memory for additional compute.
Adding several callbacks like this has made DPM's API less than ideal at this
stage. However, I could not find any reasonable alternative.
* [DT] [33/n] Compile flow task groups
task groups need to compiled in order to pickle the object in fblearner. However I also changed the Job's compile function as creating new object is not necessary.
* Initial commit for sparse_normalize vectorization and benchmark
* [GanH]: LB Calibration for JSD
as titled
* Tracing event in async executor
Adding event tracing through TRACE_EVENT macro in async executor
* [Resubmit] D7409751 Reseting book-keeping blobs when the reservoir is reset
D7409751 got lost in D7464958
* Visualizing realtime weights values
we want to visualize the weights values as optimizer is iterating. This diff supports to visual the weights at an assigned index.
Currently, we assume the blob to be 2 dimensional.
* [GanH][Easy]: Fix Homotopy Weighting
apparantely, there was a bug in homotopy weight (alpha, beta) update
* [c2] move sparse hash unique op out of oss
so that oss do not need to depend on google hash map.
* Get rid of std::round as it's not supported on Android
* Revert changes on setup.py
* Skip shaky test on Dataio
* fix
2018-04-11 04:11:43 +00:00
|
|
|
|
|
|
|
|
def ref(D, I, L):
|
|
|
|
|
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
|
|
|
|
|
line = 0
|
|
|
|
|
for g in range(L.size):
|
|
|
|
|
for _ in range(L[g]):
|
|
|
|
|
if len(D.shape) > 1:
|
|
|
|
|
R[g, :] += D[I[line], :]
|
|
|
|
|
else:
|
|
|
|
|
R[g] += D[I[line]]
|
|
|
|
|
line += 1
|
|
|
|
|
return [R]
|
|
|
|
|
|
|
|
|
|
self.assertReferenceChecks(gc, op, [X, Y, Z], ref)
|
2017-05-03 05:07:13 +00:00
|
|
|
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
|
|
|
|
|
self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0])
|
|
|
|
|
|
2018-09-19 16:58:15 +00:00
|
|
|
@serial.given(
|
Update from facebook (#7696)
* Fix handling of empty batches in SumReduceDimsOp
As titled
* Deferrable async_scheduling finishRun fix
Proper order of finishing run operations in deferrable_async_scheduling net
* Simplify exception handling in async_scheduling
Simplify exception handling, no need to busy wait, thread that processes the
last task can finish the run
* [C2]worker_coordinator_memorize_worker_ids
As titled. This is related to T28689868, where the number of blobs we want to create is equal to the number of worker ids
* Add unit test for nets with no type set
* Ignore total length argument in sympolic_pad_packed_sequence
1- There was a mistake in the code that total_length was added to the wrong symbolic function (pack_padded_sequence) instead of (pad_packed_sequence)
2- No need to throw an exception if total_length is given since it is only used to enable data_parallel training on multi-gpus and doesn't have anything to do with onnx export, so just ignore it. https://fburl.com/tk4gciqp
* Add support for MKLDNN to async_scheduling
Just add MKLDNN as a possible CPU option to async_scheduling's pool function
* [AuFL][ensemble] support branch output for prediction
This diff supports using predictions from different branches and thus enables model ensembling (not fully independent).
* Fix a bug in add_loss in layer_model_helper
As titled.
* Support lradaption for adam
1.lr adaption operator
2.apply to dense adam
* Perf tweaks for async_scheduling
Restore single pool option + remove unnecessary (no-ops) calls
* add quantization to SparseSimdAdagradOp
add a bunch of quantization signatures to SparseSimdAdagradOp, implementations to come next
* [sr] [codemod] Change all SR callsites to use new API
@allow-large-files
This diff refactors all callsites of SR to use the slightly changed API introduced in the diff below. Really what this means is that you need to include the correct header. Also if you were using `ClientFactory::newFactory` you need to not prefix it with `ClientFactory::`.
```
cd ~/fbsource/fbcode
find ./ -type f -exec sed -i -e 's:#include "servicerouter/client/cpp2/ClientFactory.h":#include "servicerouter/client/cpp2/ServiceRouter.h":' -e 's:#include <servicerouter/client/cpp2/ClientFactory.h>:#include <servicerouter/client/cpp2/ServiceRouter.h>:' -e 's/ClientFactory::newFactory(/newFactory(/g' {} \;
```
Also manually fixed spots that couldn't be done automatically (or broke because they depended on transitive includes).
* Back out "Fix handling of empty batches in SumReduceDimsOp"
Original commit changeset: 282da1730cc2 This commit is blocking the
Github->fbcode sync, which really needs to get merged ASAP. D7881937 which this
diff depends on will be reverted in the sync D7990948 which causes this to
break. The sync diff cannot be patched with this reversion because it must be
landed against base revision 5c8c099 , and D7881937 must not be included in the
sync diff because it is breaking GPU tests that are not available in sandcastle
: https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-cuda8.0-cudnn6-ubuntu16.04-test/3638/console
for one example.
* Add the flow to support operator benchmark
1) generate model with the operator 2) upload to everstore 3) generate model spec into json file 4) start running the benchmark
* [tum][gpu] Connect DPM trainer with flow and unit tests
This diff:
- Fix some small bugs for Yiming's recent changes to parallelizer, so it suits real use cases.
- Add correct tags to the TUM code, so we can do data parallel transform
- pass extra info when instantiation.
- add unit test for using DPM in TUM model
After this diff, we can do simple box, multi-gpu fully-sync trainer for TUM in Fblearner workflow, but may still need to do speed benchmarking.
* w/o normalized lradaption for adam dense only
The previous lr adaption includes a normalization step when performing the dot product operation. This is not exactly same as what is proposed in the paper. I add normalization as an option. Without it, the operator performs exactly what the paper proposed. With the option, we add the normalization step
* [fb] Use SharedPromise in DeferrableAsyncSchedulingNet
This code is to simplify DeferrableAsyncSchedulingNet by removing condition
variable + small fixes
* [tum] implement cuda sparseLengthsMean and LengthsMean
as title
* Adding an optional parameter to allow use of protobufs in InferShapesAndTypes function.
Adding an optional parameter to allow use of protobufs in InferShapesAndTypes function.
* Move feature_to_index to FeatureSpec.feature_to_index
move feature_to_index to FeatureSpec.feature_to_index to avoid override other fields
* [Caffe2] Rename bytes_moved to bytes_written
Just a rename in preparation for supporting bytes_read.
* [c2] fix ReduceFrontSumOp for empty case by setting 0
otherwise, it may use the results from last iteration when it's empty batch.
* [Caffe2] [Int8] Improve Intel CPU performance
* [Easy] Improve PrependDim op logging
as titled
* DBFileReader expand db_path using os.path.expanduser(..)
Since there are a lot of possible use cases of `DBFileReader` to read from user home path, like `~/local/sample.db`, I want to save people's trouble of calling `os.path.expanduser(db_path)` themselves.
* [Caffe2] Add bytes_read to cost structure
We're adding analytical read bytes to cost functions. This extends the structure accordingly for all CostInference defined operators.
Additionally, some small bug fixes were performed:
1) Cost functions now extract type information of operands instead of assuming float
* Fix sleef on aarch64 for hhvm
@bypass-lint
Rename flag
* Remove duplicated part in caffe2/ideep/operators/conv_op.cc
should be sync error
* Rename test helper function test_adagrad_sparse_helper to adagrad_sparse_test_helper to avoid confusing pytest
2018-05-20 06:10:48 +00:00
|
|
|
inputs=hu.lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
|
|
|
|
max_value=5,
|
|
|
|
|
allow_empty=True,
|
|
|
|
|
),
|
|
|
|
|
**hu.gcs
|
|
|
|
|
)
|
|
|
|
|
def test_lengths_mean(self, inputs, gc, dc):
|
|
|
|
|
X, Y = inputs
|
|
|
|
|
op = core.CreateOperator("LengthsMean", ["X", "Y"], "out")
|
|
|
|
|
|
|
|
|
|
def ref(D, L):
|
|
|
|
|
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
|
|
|
|
|
line = 0
|
|
|
|
|
for g in range(L.size):
|
|
|
|
|
for _ in range(L[g]):
|
|
|
|
|
if len(D.shape) > 1:
|
|
|
|
|
R[g, :] += D[line, :]
|
|
|
|
|
else:
|
|
|
|
|
R[g] += D[line]
|
|
|
|
|
line += 1
|
|
|
|
|
if L[g] > 1:
|
|
|
|
|
if len(D.shape) > 1:
|
|
|
|
|
R[g, :] = R[g, :] / L[g]
|
|
|
|
|
else:
|
|
|
|
|
R[g] = R[g] / L[g]
|
|
|
|
|
|
|
|
|
|
return [R]
|
|
|
|
|
|
|
|
|
|
self.assertReferenceChecks(gc, op, [X, Y], ref)
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, Y], [0])
|
|
|
|
|
self.assertGradientChecks(gc, op, [X, Y], 0, [0])
|
|
|
|
|
|
2018-09-19 16:58:15 +00:00
|
|
|
@serial.given(
|
Update from facebook (#7696)
* Fix handling of empty batches in SumReduceDimsOp
As titled
* Deferrable async_scheduling finishRun fix
Proper order of finishing run operations in deferrable_async_scheduling net
* Simplify exception handling in async_scheduling
Simplify exception handling, no need to busy wait, thread that processes the
last task can finish the run
* [C2]worker_coordinator_memorize_worker_ids
As titled. This is related to T28689868, where the number of blobs we want to create is equal to the number of worker ids
* Add unit test for nets with no type set
* Ignore total length argument in sympolic_pad_packed_sequence
1- There was a mistake in the code that total_length was added to the wrong symbolic function (pack_padded_sequence) instead of (pad_packed_sequence)
2- No need to throw an exception if total_length is given since it is only used to enable data_parallel training on multi-gpus and doesn't have anything to do with onnx export, so just ignore it. https://fburl.com/tk4gciqp
* Add support for MKLDNN to async_scheduling
Just add MKLDNN as a possible CPU option to async_scheduling's pool function
* [AuFL][ensemble] support branch output for prediction
This diff supports using predictions from different branches and thus enables model ensembling (not fully independent).
* Fix a bug in add_loss in layer_model_helper
As titled.
* Support lradaption for adam
1.lr adaption operator
2.apply to dense adam
* Perf tweaks for async_scheduling
Restore single pool option + remove unnecessary (no-ops) calls
* add quantization to SparseSimdAdagradOp
add a bunch of quantization signatures to SparseSimdAdagradOp, implementations to come next
* [sr] [codemod] Change all SR callsites to use new API
@allow-large-files
This diff refactors all callsites of SR to use the slightly changed API introduced in the diff below. Really what this means is that you need to include the correct header. Also if you were using `ClientFactory::newFactory` you need to not prefix it with `ClientFactory::`.
```
cd ~/fbsource/fbcode
find ./ -type f -exec sed -i -e 's:#include "servicerouter/client/cpp2/ClientFactory.h":#include "servicerouter/client/cpp2/ServiceRouter.h":' -e 's:#include <servicerouter/client/cpp2/ClientFactory.h>:#include <servicerouter/client/cpp2/ServiceRouter.h>:' -e 's/ClientFactory::newFactory(/newFactory(/g' {} \;
```
Also manually fixed spots that couldn't be done automatically (or broke because they depended on transitive includes).
* Back out "Fix handling of empty batches in SumReduceDimsOp"
Original commit changeset: 282da1730cc2 This commit is blocking the
Github->fbcode sync, which really needs to get merged ASAP. D7881937 which this
diff depends on will be reverted in the sync D7990948 which causes this to
break. The sync diff cannot be patched with this reversion because it must be
landed against base revision 5c8c099 , and D7881937 must not be included in the
sync diff because it is breaking GPU tests that are not available in sandcastle
: https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-cuda8.0-cudnn6-ubuntu16.04-test/3638/console
for one example.
* Add the flow to support operator benchmark
1) generate model with the operator 2) upload to everstore 3) generate model spec into json file 4) start running the benchmark
* [tum][gpu] Connect DPM trainer with flow and unit tests
This diff:
- Fix some small bugs for Yiming's recent changes to parallelizer, so it suits real use cases.
- Add correct tags to the TUM code, so we can do data parallel transform
- pass extra info when instantiation.
- add unit test for using DPM in TUM model
After this diff, we can do simple box, multi-gpu fully-sync trainer for TUM in Fblearner workflow, but may still need to do speed benchmarking.
* w/o normalized lradaption for adam dense only
The previous lr adaption includes a normalization step when performing the dot product operation. This is not exactly same as what is proposed in the paper. I add normalization as an option. Without it, the operator performs exactly what the paper proposed. With the option, we add the normalization step
* [fb] Use SharedPromise in DeferrableAsyncSchedulingNet
This code is to simplify DeferrableAsyncSchedulingNet by removing condition
variable + small fixes
* [tum] implement cuda sparseLengthsMean and LengthsMean
as title
* Adding an optional parameter to allow use of protobufs in InferShapesAndTypes function.
Adding an optional parameter to allow use of protobufs in InferShapesAndTypes function.
* Move feature_to_index to FeatureSpec.feature_to_index
move feature_to_index to FeatureSpec.feature_to_index to avoid override other fields
* [Caffe2] Rename bytes_moved to bytes_written
Just a rename in preparation for supporting bytes_read.
* [c2] fix ReduceFrontSumOp for empty case by setting 0
otherwise, it may use the results from last iteration when it's empty batch.
* [Caffe2] [Int8] Improve Intel CPU performance
* [Easy] Improve PrependDim op logging
as titled
* DBFileReader expand db_path using os.path.expanduser(..)
Since there are a lot of possible use cases of `DBFileReader` to read from user home path, like `~/local/sample.db`, I want to save people's trouble of calling `os.path.expanduser(db_path)` themselves.
* [Caffe2] Add bytes_read to cost structure
We're adding analytical read bytes to cost functions. This extends the structure accordingly for all CostInference defined operators.
Additionally, some small bug fixes were performed:
1) Cost functions now extract type information of operands instead of assuming float
* Fix sleef on aarch64 for hhvm
@bypass-lint
Rename flag
* Remove duplicated part in caffe2/ideep/operators/conv_op.cc
should be sync error
* Rename test helper function test_adagrad_sparse_helper to adagrad_sparse_test_helper to avoid confusing pytest
2018-05-20 06:10:48 +00:00
|
|
|
inputs=hu.sparse_lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
|
|
|
|
max_value=5,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
|
|
|
|
**hu.gcs
|
|
|
|
|
)
|
|
|
|
|
def test_sparse_lengths_mean(self, inputs, gc, dc):
|
|
|
|
|
X, Y, Z = inputs
|
|
|
|
|
op = core.CreateOperator("SparseLengthsMean", ["X", "Y", "Z"], "out")
|
|
|
|
|
|
|
|
|
|
def ref(D, I, L):
|
|
|
|
|
R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype)
|
|
|
|
|
line = 0
|
|
|
|
|
for g in range(L.size):
|
|
|
|
|
for _ in range(L[g]):
|
|
|
|
|
if len(D.shape) > 1:
|
|
|
|
|
R[g, :] += D[I[line], :]
|
|
|
|
|
else:
|
|
|
|
|
R[g] += D[I[line]]
|
|
|
|
|
line += 1
|
|
|
|
|
|
|
|
|
|
if L[g] > 1:
|
|
|
|
|
if len(D.shape) > 1:
|
|
|
|
|
R[g, :] = R[g, :] / L[g]
|
|
|
|
|
else:
|
|
|
|
|
R[g] = R[g] / L[g]
|
|
|
|
|
|
|
|
|
|
return [R]
|
|
|
|
|
|
|
|
|
|
self.assertReferenceChecks(gc, op, [X, Y, Z], ref)
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
|
|
|
|
|
self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0])
|
|
|
|
|
|
2018-09-19 16:58:15 +00:00
|
|
|
@serial.given(
|
2018-03-26 18:14:26 +00:00
|
|
|
grad_on_weights=st.booleans(),
|
|
|
|
|
inputs=hu.sparse_lengths_tensor(
|
|
|
|
|
dtype=np.float32,
|
|
|
|
|
min_value=1,
|
|
|
|
|
max_value=5,
|
|
|
|
|
allow_empty=True
|
|
|
|
|
),
|
[Caffe2] Changes done inside Facebook (#6378)
* fix unit test for sqrt op
From the error logging:
[idx, grad, grad_estimate] are:
[[ 146. 0.5 0.45776367]
[ 147. 0.5 0.45776367]
The gradient == 0.5 is correct, which means the SqrtOp and its gradient is doing right job. (Because y = sqrt(x), loss = y^2/2 = x/2, and then d(loss)/dx = 1/2 = 0.5; )
The test failed because of numerical problem of grad_estimate (in unit test). It can be because the step_size is small, and float precision is not high (when there are multiple elements in the tensor, we do sum(y^2) to compute loss)
This diff
- increase the step size, and also move the test cases to be further away from 0 (where sqrt(x) is not well defined) to be safe :)
- also clean up, and merge the test case for inplace Vs. non-inplace
Tested with:
`CAFFE2_HYPOTHESIS_PROFILE=debug ai_bt caffe2/caffe2/python/operator_test:elementwise_ops_test -- "test_sqrt"`
* CompositeReader & CompositeReaderBuilder
A new type of reader gluing multiple readers together.
* Back out "Revert D7394363: [GanH]: Log D Trick for Cross Entropy with Sigmoid"
Original commit changeset: 9325a4356dbe
* [dai][WIP] convert params to int8 on ps before sending to trainer
Add float->uint8 conversion in addition to float->fp16 conversion in model_saver.
* [easy] improve unit test for sparse length sum ops
as desc.
#accept2ship
* Update GitHub upstream to 771fcb3455cbfe69c2abcc4cb3bd7ef92d59af24
* move sparse hash unique ops to OOS and add unit tests
- move the SparseHash version to OOS, since 'sparsehash' is already deps of caffe2 OOS: https://fburl.com/arssw4n1
- The 'SparseHash' engine is also being used in OOS, so the SparseHash version shall be in OOS to reduce confusion: https://fburl.com/o5ea7ah2
- fix the CUDA UniqueOp for the case when batch is empty.
- add unit test
* group_norm_op for caffe2
This is the cuda op for Group Normalization (GN): https://arxiv.org/abs/1803.08494
This code implements GN in one op that computes Y=gamma * (X-mu) / sigma + beta and also its gradients. It is expected to have minimal memory consumption (similar to the BN op), without creating new blobs if GN were implemented as several ops (e.g., reshape, norm_mean/std, affine_channel).
* Resubmit D7405233: disappeared in D7464958
OOS publish causes the op missing -- however, test was still there
* [c2] add sparse hash engine for cuda unique op
The SparseHash version of UniqueOp copy input tensor to CPU, and make use of sparse hash map to get unique output, and then copy back to GPU.
* [dper][gpu] enable unit testing gpu trainer for sparse nn
to debug the GPU trainer using mock data in unit test.
make it easier to develop GPU trainer for new models.
* Reuse Gloo context for Synchronize() calls
Previously we were creating (and leaking) the Gloo context on each call to Synchronize(). Now only run the common world op and create the barrier net once, then run the barrier net on each Synchronize() call. Since timeout is associated with the Gloo context, assert that the timeout is fixed instead of trying to handle the complexity of multiple timeouts (and associated contexts).
* [GanH/WGAN][1/n]: add FC param clipping
as titled
* [mobile] minimizing changes between caffe2_benchmark and speed_benchmark
* [GanH]: enable diagnose within model
avoid finding blob names but to directly enable inside the model
* Add `net_transformer_fun` option to DPM
This callback allows for various transformations to be made to the
model after gradient operators have been added. The immediate motivation for
this is to allow transformations such has "checkpoint-and-recompute" which
allow trading off memory for additional compute.
Adding several callbacks like this has made DPM's API less than ideal at this
stage. However, I could not find any reasonable alternative.
* [DT] [33/n] Compile flow task groups
task groups need to compiled in order to pickle the object in fblearner. However I also changed the Job's compile function as creating new object is not necessary.
* Initial commit for sparse_normalize vectorization and benchmark
* [GanH]: LB Calibration for JSD
as titled
* Tracing event in async executor
Adding event tracing through TRACE_EVENT macro in async executor
* [Resubmit] D7409751 Reseting book-keeping blobs when the reservoir is reset
D7409751 got lost in D7464958
* Visualizing realtime weights values
we want to visualize the weights values as optimizer is iterating. This diff supports to visual the weights at an assigned index.
Currently, we assume the blob to be 2 dimensional.
* [GanH][Easy]: Fix Homotopy Weighting
apparantely, there was a bug in homotopy weight (alpha, beta) update
* [c2] move sparse hash unique op out of oss
so that oss do not need to depend on google hash map.
* Get rid of std::round as it's not supported on Android
* Revert changes on setup.py
* Skip shaky test on Dataio
* fix
2018-04-11 04:11:43 +00:00
|
|
|
seed=st.integers(min_value=0, max_value=100),
|
2018-03-26 18:14:26 +00:00
|
|
|
**hu.gcs
|
|
|
|
|
)
|
|
|
|
|
def test_sparse_lengths_weighted_sum(
|
[Caffe2] Changes done inside Facebook (#6378)
* fix unit test for sqrt op
From the error logging:
[idx, grad, grad_estimate] are:
[[ 146. 0.5 0.45776367]
[ 147. 0.5 0.45776367]
The gradient == 0.5 is correct, which means the SqrtOp and its gradient is doing right job. (Because y = sqrt(x), loss = y^2/2 = x/2, and then d(loss)/dx = 1/2 = 0.5; )
The test failed because of numerical problem of grad_estimate (in unit test). It can be because the step_size is small, and float precision is not high (when there are multiple elements in the tensor, we do sum(y^2) to compute loss)
This diff
- increase the step size, and also move the test cases to be further away from 0 (where sqrt(x) is not well defined) to be safe :)
- also clean up, and merge the test case for inplace Vs. non-inplace
Tested with:
`CAFFE2_HYPOTHESIS_PROFILE=debug ai_bt caffe2/caffe2/python/operator_test:elementwise_ops_test -- "test_sqrt"`
* CompositeReader & CompositeReaderBuilder
A new type of reader gluing multiple readers together.
* Back out "Revert D7394363: [GanH]: Log D Trick for Cross Entropy with Sigmoid"
Original commit changeset: 9325a4356dbe
* [dai][WIP] convert params to int8 on ps before sending to trainer
Add float->uint8 conversion in addition to float->fp16 conversion in model_saver.
* [easy] improve unit test for sparse length sum ops
as desc.
#accept2ship
* Update GitHub upstream to 771fcb3455cbfe69c2abcc4cb3bd7ef92d59af24
* move sparse hash unique ops to OOS and add unit tests
- move the SparseHash version to OOS, since 'sparsehash' is already deps of caffe2 OOS: https://fburl.com/arssw4n1
- The 'SparseHash' engine is also being used in OOS, so the SparseHash version shall be in OOS to reduce confusion: https://fburl.com/o5ea7ah2
- fix the CUDA UniqueOp for the case when batch is empty.
- add unit test
* group_norm_op for caffe2
This is the cuda op for Group Normalization (GN): https://arxiv.org/abs/1803.08494
This code implements GN in one op that computes Y=gamma * (X-mu) / sigma + beta and also its gradients. It is expected to have minimal memory consumption (similar to the BN op), without creating new blobs if GN were implemented as several ops (e.g., reshape, norm_mean/std, affine_channel).
* Resubmit D7405233: disappeared in D7464958
OOS publish causes the op missing -- however, test was still there
* [c2] add sparse hash engine for cuda unique op
The SparseHash version of UniqueOp copy input tensor to CPU, and make use of sparse hash map to get unique output, and then copy back to GPU.
* [dper][gpu] enable unit testing gpu trainer for sparse nn
to debug the GPU trainer using mock data in unit test.
make it easier to develop GPU trainer for new models.
* Reuse Gloo context for Synchronize() calls
Previously we were creating (and leaking) the Gloo context on each call to Synchronize(). Now only run the common world op and create the barrier net once, then run the barrier net on each Synchronize() call. Since timeout is associated with the Gloo context, assert that the timeout is fixed instead of trying to handle the complexity of multiple timeouts (and associated contexts).
* [GanH/WGAN][1/n]: add FC param clipping
as titled
* [mobile] minimizing changes between caffe2_benchmark and speed_benchmark
* [GanH]: enable diagnose within model
avoid finding blob names but to directly enable inside the model
* Add `net_transformer_fun` option to DPM
This callback allows for various transformations to be made to the
model after gradient operators have been added. The immediate motivation for
this is to allow transformations such has "checkpoint-and-recompute" which
allow trading off memory for additional compute.
Adding several callbacks like this has made DPM's API less than ideal at this
stage. However, I could not find any reasonable alternative.
* [DT] [33/n] Compile flow task groups
task groups need to compiled in order to pickle the object in fblearner. However I also changed the Job's compile function as creating new object is not necessary.
* Initial commit for sparse_normalize vectorization and benchmark
* [GanH]: LB Calibration for JSD
as titled
* Tracing event in async executor
Adding event tracing through TRACE_EVENT macro in async executor
* [Resubmit] D7409751 Reseting book-keeping blobs when the reservoir is reset
D7409751 got lost in D7464958
* Visualizing realtime weights values
we want to visualize the weights values as optimizer is iterating. This diff supports to visual the weights at an assigned index.
Currently, we assume the blob to be 2 dimensional.
* [GanH][Easy]: Fix Homotopy Weighting
apparantely, there was a bug in homotopy weight (alpha, beta) update
* [c2] move sparse hash unique op out of oss
so that oss do not need to depend on google hash map.
* Get rid of std::round as it's not supported on Android
* Revert changes on setup.py
* Skip shaky test on Dataio
* fix
2018-04-11 04:11:43 +00:00
|
|
|
self, grad_on_weights, inputs, seed, gc, dc):
|
2018-03-26 18:14:26 +00:00
|
|
|
D, I, L = inputs
|
[Caffe2] Changes done inside Facebook (#6378)
* fix unit test for sqrt op
From the error logging:
[idx, grad, grad_estimate] are:
[[ 146. 0.5 0.45776367]
[ 147. 0.5 0.45776367]
The gradient == 0.5 is correct, which means the SqrtOp and its gradient is doing right job. (Because y = sqrt(x), loss = y^2/2 = x/2, and then d(loss)/dx = 1/2 = 0.5; )
The test failed because of numerical problem of grad_estimate (in unit test). It can be because the step_size is small, and float precision is not high (when there are multiple elements in the tensor, we do sum(y^2) to compute loss)
This diff
- increase the step size, and also move the test cases to be further away from 0 (where sqrt(x) is not well defined) to be safe :)
- also clean up, and merge the test case for inplace Vs. non-inplace
Tested with:
`CAFFE2_HYPOTHESIS_PROFILE=debug ai_bt caffe2/caffe2/python/operator_test:elementwise_ops_test -- "test_sqrt"`
* CompositeReader & CompositeReaderBuilder
A new type of reader gluing multiple readers together.
* Back out "Revert D7394363: [GanH]: Log D Trick for Cross Entropy with Sigmoid"
Original commit changeset: 9325a4356dbe
* [dai][WIP] convert params to int8 on ps before sending to trainer
Add float->uint8 conversion in addition to float->fp16 conversion in model_saver.
* [easy] improve unit test for sparse length sum ops
as desc.
#accept2ship
* Update GitHub upstream to 771fcb3455cbfe69c2abcc4cb3bd7ef92d59af24
* move sparse hash unique ops to OOS and add unit tests
- move the SparseHash version to OOS, since 'sparsehash' is already deps of caffe2 OOS: https://fburl.com/arssw4n1
- The 'SparseHash' engine is also being used in OOS, so the SparseHash version shall be in OOS to reduce confusion: https://fburl.com/o5ea7ah2
- fix the CUDA UniqueOp for the case when batch is empty.
- add unit test
* group_norm_op for caffe2
This is the cuda op for Group Normalization (GN): https://arxiv.org/abs/1803.08494
This code implements GN in one op that computes Y=gamma * (X-mu) / sigma + beta and also its gradients. It is expected to have minimal memory consumption (similar to the BN op), without creating new blobs if GN were implemented as several ops (e.g., reshape, norm_mean/std, affine_channel).
* Resubmit D7405233: disappeared in D7464958
OOS publish causes the op missing -- however, test was still there
* [c2] add sparse hash engine for cuda unique op
The SparseHash version of UniqueOp copy input tensor to CPU, and make use of sparse hash map to get unique output, and then copy back to GPU.
* [dper][gpu] enable unit testing gpu trainer for sparse nn
to debug the GPU trainer using mock data in unit test.
make it easier to develop GPU trainer for new models.
* Reuse Gloo context for Synchronize() calls
Previously we were creating (and leaking) the Gloo context on each call to Synchronize(). Now only run the common world op and create the barrier net once, then run the barrier net on each Synchronize() call. Since timeout is associated with the Gloo context, assert that the timeout is fixed instead of trying to handle the complexity of multiple timeouts (and associated contexts).
* [GanH/WGAN][1/n]: add FC param clipping
as titled
* [mobile] minimizing changes between caffe2_benchmark and speed_benchmark
* [GanH]: enable diagnose within model
avoid finding blob names but to directly enable inside the model
* Add `net_transformer_fun` option to DPM
This callback allows for various transformations to be made to the
model after gradient operators have been added. The immediate motivation for
this is to allow transformations such has "checkpoint-and-recompute" which
allow trading off memory for additional compute.
Adding several callbacks like this has made DPM's API less than ideal at this
stage. However, I could not find any reasonable alternative.
* [DT] [33/n] Compile flow task groups
task groups need to compiled in order to pickle the object in fblearner. However I also changed the Job's compile function as creating new object is not necessary.
* Initial commit for sparse_normalize vectorization and benchmark
* [GanH]: LB Calibration for JSD
as titled
* Tracing event in async executor
Adding event tracing through TRACE_EVENT macro in async executor
* [Resubmit] D7409751 Reseting book-keeping blobs when the reservoir is reset
D7409751 got lost in D7464958
* Visualizing realtime weights values
we want to visualize the weights values as optimizer is iterating. This diff supports to visual the weights at an assigned index.
Currently, we assume the blob to be 2 dimensional.
* [GanH][Easy]: Fix Homotopy Weighting
apparantely, there was a bug in homotopy weight (alpha, beta) update
* [c2] move sparse hash unique op out of oss
so that oss do not need to depend on google hash map.
* Get rid of std::round as it's not supported on Android
* Revert changes on setup.py
* Skip shaky test on Dataio
* fix
2018-04-11 04:11:43 +00:00
|
|
|
|
|
|
|
|
np.random.seed(int(seed))
|
|
|
|
|
|
2018-03-26 18:14:26 +00:00
|
|
|
W = np.random.rand(I.size).astype(np.float32)
|
|
|
|
|
op = core.CreateOperator(
|
|
|
|
|
"SparseLengthsWeightedSum",
|
|
|
|
|
["D", "W", "I", "L"],
|
|
|
|
|
"out",
|
|
|
|
|
grad_on_weights=grad_on_weights)
|
|
|
|
|
self.assertDeviceChecks(dc, op, [D, W, I, L], [0])
|
|
|
|
|
self.assertReferenceChecks(
|
|
|
|
|
device_option=gc,
|
|
|
|
|
op=op,
|
|
|
|
|
inputs=[D, W, I, L],
|
|
|
|
|
reference=sparse_lengths_weighted_sum_ref,
|
|
|
|
|
threshold=1e-4,
|
|
|
|
|
output_to_grad='out',
|
|
|
|
|
grad_reference=partial(
|
|
|
|
|
sparse_lengths_weighted_sum_grad_ref,
|
|
|
|
|
grad_on_weights=grad_on_weights),
|
|
|
|
|
)
|
|
|
|
|
self.assertGradientChecks(gc, op, [D, W, I, L], 0, [0])
|
|
|
|
|
if grad_on_weights:
|
|
|
|
|
self.assertGradientChecks(gc, op, [D, W, I, L], 1, [0])
|
2017-08-22 22:19:03 +00:00
|
|
|
|
2017-08-03 09:53:35 +00:00
|
|
|
@given(**hu.gcs)
|
|
|
|
|
def test_sparse_lengths_indices_in_gradient_sum_gpu(self, gc, dc):
|
|
|
|
|
X = np.random.rand(3, 3, 4, 5).astype(np.float32)
|
|
|
|
|
Y = np.asarray([3, 3, 2]).astype(np.int32)
|
|
|
|
|
Z = np.random.randint(0, 50, size=8).astype(np.int64)
|
|
|
|
|
op = core.CreateOperator(
|
|
|
|
|
"SparseLengthsIndicesInGradientSumGradient", ["X", "Y", "Z"], "out"
|
|
|
|
|
)
|
|
|
|
|
self.assertDeviceChecks(dc, op, [X, Y, Z], [0])
|
|
|
|
|
|
Update from facebook (#7696)
* Fix handling of empty batches in SumReduceDimsOp
As titled
* Deferrable async_scheduling finishRun fix
Proper order of finishing run operations in deferrable_async_scheduling net
* Simplify exception handling in async_scheduling
Simplify exception handling, no need to busy wait, thread that processes the
last task can finish the run
* [C2]worker_coordinator_memorize_worker_ids
As titled. This is related to T28689868, where the number of blobs we want to create is equal to the number of worker ids
* Add unit test for nets with no type set
* Ignore total length argument in sympolic_pad_packed_sequence
1- There was a mistake in the code that total_length was added to the wrong symbolic function (pack_padded_sequence) instead of (pad_packed_sequence)
2- No need to throw an exception if total_length is given since it is only used to enable data_parallel training on multi-gpus and doesn't have anything to do with onnx export, so just ignore it. https://fburl.com/tk4gciqp
* Add support for MKLDNN to async_scheduling
Just add MKLDNN as a possible CPU option to async_scheduling's pool function
* [AuFL][ensemble] support branch output for prediction
This diff supports using predictions from different branches and thus enables model ensembling (not fully independent).
* Fix a bug in add_loss in layer_model_helper
As titled.
* Support lradaption for adam
1.lr adaption operator
2.apply to dense adam
* Perf tweaks for async_scheduling
Restore single pool option + remove unnecessary (no-ops) calls
* add quantization to SparseSimdAdagradOp
add a bunch of quantization signatures to SparseSimdAdagradOp, implementations to come next
* [sr] [codemod] Change all SR callsites to use new API
@allow-large-files
This diff refactors all callsites of SR to use the slightly changed API introduced in the diff below. Really what this means is that you need to include the correct header. Also if you were using `ClientFactory::newFactory` you need to not prefix it with `ClientFactory::`.
```
cd ~/fbsource/fbcode
find ./ -type f -exec sed -i -e 's:#include "servicerouter/client/cpp2/ClientFactory.h":#include "servicerouter/client/cpp2/ServiceRouter.h":' -e 's:#include <servicerouter/client/cpp2/ClientFactory.h>:#include <servicerouter/client/cpp2/ServiceRouter.h>:' -e 's/ClientFactory::newFactory(/newFactory(/g' {} \;
```
Also manually fixed spots that couldn't be done automatically (or broke because they depended on transitive includes).
* Back out "Fix handling of empty batches in SumReduceDimsOp"
Original commit changeset: 282da1730cc2 This commit is blocking the
Github->fbcode sync, which really needs to get merged ASAP. D7881937 which this
diff depends on will be reverted in the sync D7990948 which causes this to
break. The sync diff cannot be patched with this reversion because it must be
landed against base revision 5c8c099 , and D7881937 must not be included in the
sync diff because it is breaking GPU tests that are not available in sandcastle
: https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-cuda8.0-cudnn6-ubuntu16.04-test/3638/console
for one example.
* Add the flow to support operator benchmark
1) generate model with the operator 2) upload to everstore 3) generate model spec into json file 4) start running the benchmark
* [tum][gpu] Connect DPM trainer with flow and unit tests
This diff:
- Fix some small bugs for Yiming's recent changes to parallelizer, so it suits real use cases.
- Add correct tags to the TUM code, so we can do data parallel transform
- pass extra info when instantiation.
- add unit test for using DPM in TUM model
After this diff, we can do simple box, multi-gpu fully-sync trainer for TUM in Fblearner workflow, but may still need to do speed benchmarking.
* w/o normalized lradaption for adam dense only
The previous lr adaption includes a normalization step when performing the dot product operation. This is not exactly same as what is proposed in the paper. I add normalization as an option. Without it, the operator performs exactly what the paper proposed. With the option, we add the normalization step
* [fb] Use SharedPromise in DeferrableAsyncSchedulingNet
This code is to simplify DeferrableAsyncSchedulingNet by removing condition
variable + small fixes
* [tum] implement cuda sparseLengthsMean and LengthsMean
as title
* Adding an optional parameter to allow use of protobufs in InferShapesAndTypes function.
Adding an optional parameter to allow use of protobufs in InferShapesAndTypes function.
* Move feature_to_index to FeatureSpec.feature_to_index
move feature_to_index to FeatureSpec.feature_to_index to avoid override other fields
* [Caffe2] Rename bytes_moved to bytes_written
Just a rename in preparation for supporting bytes_read.
* [c2] fix ReduceFrontSumOp for empty case by setting 0
otherwise, it may use the results from last iteration when it's empty batch.
* [Caffe2] [Int8] Improve Intel CPU performance
* [Easy] Improve PrependDim op logging
as titled
* DBFileReader expand db_path using os.path.expanduser(..)
Since there are a lot of possible use cases of `DBFileReader` to read from user home path, like `~/local/sample.db`, I want to save people's trouble of calling `os.path.expanduser(db_path)` themselves.
* [Caffe2] Add bytes_read to cost structure
We're adding analytical read bytes to cost functions. This extends the structure accordingly for all CostInference defined operators.
Additionally, some small bug fixes were performed:
1) Cost functions now extract type information of operands instead of assuming float
* Fix sleef on aarch64 for hhvm
@bypass-lint
Rename flag
* Remove duplicated part in caffe2/ideep/operators/conv_op.cc
should be sync error
* Rename test helper function test_adagrad_sparse_helper to adagrad_sparse_test_helper to avoid confusing pytest
2018-05-20 06:10:48 +00:00
|
|
|
@given(**hu.gcs)
def test_sparse_lengths_indices_in_gradient_mean_gpu(self, gc, dc):
    """Device check for SparseLengthsIndicesInGradientMeanGradient.

    Feeds a random upstream gradient (one slice per segment), the segment
    lengths, and the gathered indices, then verifies the op produces the
    same output on every available device.
    """
    # Upstream gradient: one (3, 4, 5) slice per segment.
    grad_out = np.random.rand(3, 3, 4, 5).astype(np.float32)
    # Segment lengths; they sum to 8, matching the number of indices below.
    lengths = np.asarray([3, 3, 2]).astype(np.int32)
    indices = np.random.randint(0, 50, size=8).astype(np.int64)
    op = core.CreateOperator(
        "SparseLengthsIndicesInGradientMeanGradient", ["X", "Y", "Z"], "out"
    )
    self.assertDeviceChecks(dc, op, [grad_out, lengths, indices], [0])
|
|
|
|
|
|
2017-08-03 09:53:35 +00:00
|
|
|
@given(**hu.gcs_cpu_only)
def test_legacy_sparse_and_lengths_sum_gradient(self, gc, dc):
    """SparseLengthsSumGradient and LengthsSumGradient must agree.

    Runs both gradient ops on the same (grad, lengths) pair in a single
    net and asserts their outputs are element-wise identical.
    """
    grad = np.random.rand(3, 64).astype(np.float32)
    lengths = np.asarray([20, 20, 10]).astype(np.int32)
    for blob_name, value in (("X", grad), ("Y", lengths)):
        workspace.FeedBlob(blob_name, value)
    net = core.Net("test_net")
    net.SparseLengthsSumGradient(["X", "Y"], "out1")
    net.LengthsSumGradient(["X", "Y"], "out2")
    workspace.RunNetOnce(net)
    result_sparse = workspace.FetchBlob("out1")
    result_dense = workspace.FetchBlob("out2")
    self.assertTrue((result_sparse == result_dense).all())
|
2017-05-03 05:07:13 +00:00
|
|
|
|
2017-11-03 12:14:01 +00:00
|
|
|
@given(**hu.gcs)
@settings(deadline=10000)
def test_sparse_lengths_sum_invalid_index(self, gc, dc):
    """SparseLengthsSum must raise when an index is out of range.

    Every index lands in [10000, 20000) while the data blob has only 50
    rows, so running the op has to fail with a RuntimeError.
    """
    data = np.random.rand(50, 3, 4, 5).astype(np.float32)
    # All indices are >= 10000 -- far beyond the 50 rows of data.
    bad_indices = (np.random.randint(0, 10000, size=10) + 10000).astype(np.int64)
    lengths = np.asarray([4, 4, 2]).astype(np.int32)
    op = core.CreateOperator(
        "SparseLengthsSum",
        ["D", "I", "L"],
        "out")
    for blob_name, value in (("D", data), ("I", bad_indices), ("L", lengths)):
        workspace.FeedBlob(blob_name, value)
    with self.assertRaises(RuntimeError):
        workspace.RunOperatorOnce(op)
|
|
|
|
|
|
2018-09-19 16:58:15 +00:00
|
|
|
@serial.given(**hu.gcs_cpu_only)
def test_sparse_lengths_positional_weighted_sum(
        self, gc, dc):
    """Check SparseLengthsPositionalWeightedSum against a composed reference.

    The reference result is assembled from existing ops: LengthsRangeFill
    produces each element's position within its segment, Gather maps those
    positions to weights, and SparseLengthsWeightedSum combines everything.
    """
    data = np.random.rand(50, 3, 4, 5).astype(np.float32)
    positional_weights = np.random.rand(50).astype(np.float32)
    idx = np.random.randint(0, 50, size=10).astype(np.int64)
    seg_lengths = np.asarray([4, 4, 2]).astype(np.int32)
    op = core.CreateOperator(
        "SparseLengthsPositionalWeightedSum",
        ["D", "W", "indices", "L"],
        "out")

    def ref_sparse(D, W, indices, L):
        # Emulate the fused op with a pipeline of simpler operators.
        def run(op_type, inputs, outputs):
            workspace.RunOperatorOnce(
                core.CreateOperator(op_type, inputs, outputs))

        workspace.FeedBlob("L", L)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("D", D)
        workspace.FeedBlob("indices", indices)
        # 0, 1, ..., len-1 within every segment.
        run("LengthsRangeFill", ["L"], ["L_pos_seq"])
        # Per-element weight = weight assigned to its in-segment position.
        run("Gather", ["W", "L_pos_seq"], ["W_gathered"])
        run("SparseLengthsWeightedSum",
            ["D", "W_gathered", "indices", "L"], "out_ref")
        return (workspace.FetchBlob("out_ref"),)

    self.assertReferenceChecks(
        gc, op, [data, positional_weights, idx, seg_lengths], ref_sparse)
|
2018-03-26 18:14:26 +00:00
|
|
|
|
2020-09-15 17:44:23 +00:00
|
|
|
@unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
@given(
    input=hu.tensor(min_dim=2, max_dim=2, max_value=20, dtype=np.float16),
    data_strategy=st.data(),
    is_mean=st.booleans(),
    **hu.gcs
)
@settings(deadline=None)
def test_sparse_lengths_fp16(self, input, data_strategy, is_mean, gc, dc):
    """SparseLengthsSum / SparseLengthsMean on fp16 data vs numpy reference.

    Lengths and indices are drawn from the hypothesis data strategy so the
    index array always matches the drawn lengths exactly.
    """
    num_rows = input.shape[0]

    # Per-segment lengths; each entry is in [0, 27].
    lengths = data_strategy.draw(
        hu.tensor(
            max_dim=1,
            max_value=input.shape[0],
            dtype=np.int32,
            elements=st.integers(min_value=0, max_value=27),
        )
    )
    total_indices = int(np.sum(lengths).item())

    # One row index per element accounted for by the lengths.
    indices = data_strategy.draw(
        hu.arrays(
            [total_indices],
            dtype=np.int64,
            elements=st.sampled_from(np.arange(num_rows)),
        )
    )

    # Pick the operator and its numpy reference in one place.
    if is_mean:
        op_type, reference = "SparseLengthsMean", sparse_lengths_mean_ref
    else:
        op_type, reference = "SparseLengthsSum", sparse_lengths_sum_ref
    op = core.CreateOperator(op_type, ["input", "indices", "lengths"], "out")
    self.assertReferenceChecks(gc, op, [input, indices, lengths], reference)
|
|
|
|
|
|
2018-03-31 03:12:02 +00:00
|
|
|
# @given(
|
|
|
|
|
# inputs=hu.lengths_tensor(
|
|
|
|
|
# dtype=np.float32,
|
|
|
|
|
# min_value=1,
|
|
|
|
|
# max_value=5,
|
|
|
|
|
# min_dim=1,
|
|
|
|
|
# max_dim=1,
|
|
|
|
|
# allow_empty=False,
|
|
|
|
|
# ),
|
|
|
|
|
# **hu.gcs
|
|
|
|
|
# )
|
|
|
|
|
# def test_lengths_max_gpu(self, inputs, gc, dc):
|
|
|
|
|
# def lengths_max_ref(I, L):
|
|
|
|
|
# R = np.zeros(shape=(len(L)), dtype=I.dtype)
|
|
|
|
|
# line = 0
|
|
|
|
|
# for g in range(len(L)):
|
|
|
|
|
# for i in range(L[g]):
|
|
|
|
|
# if i == 0:
|
|
|
|
|
# R[g] = I[line]
|
|
|
|
|
# else:
|
|
|
|
|
# R[g] = max(R[g], I[line])
|
|
|
|
|
# line += 1
|
|
|
|
|
# return [R]
|
|
|
|
|
|
|
|
|
|
# X, lengths = inputs
|
|
|
|
|
# op = core.CreateOperator("LengthsMax", ["X", "lengths"], "out")
|
|
|
|
|
# self.assertDeviceChecks(dc, op, [X, lengths], [0])
|
|
|
|
|
# self.assertReferenceChecks(
|
|
|
|
|
# device_option=gc,
|
|
|
|
|
# op=op,
|
|
|
|
|
# inputs=[X, lengths],
|
|
|
|
|
# reference=lengths_max_ref,
|
|
|
|
|
# threshold=1e-4,
|
|
|
|
|
# output_to_grad='out',
|
|
|
|
|
# )
|
2018-03-30 22:36:34 +00:00
|
|
|
|
|
|
|
|
|
2016-11-23 02:31:47 +00:00
|
|
|
if __name__ == "__main__":
    # unittest is already imported at the top of the file; the previous
    # redundant local `import unittest` has been removed.
    unittest.main()
|