Adding the Percentile op & UT

Reviewed By: MisterTea

Differential Revision: D6879507

fbshipit-source-id: 7ca4165a42c073e384d3a6138ef033ca384afd49
This commit is contained in:
Evgeny Kharitonov 2018-02-05 15:34:06 -08:00 committed by Facebook Github Bot
parent 3f0a99dc90
commit 7c7e09fe2d
3 changed files with 262 additions and 0 deletions

View file

@ -0,0 +1,132 @@
#include "caffe2/operators/percentile_op.h"
namespace caffe2 {
// CPU implementation of the Percentile operator.
//
// Inputs:
//   X             - 2D float tensor (num_examples x num_features) of raw values.
//   VAL_PCT_PAIRS - 2D float tensor (num_values x 2); column 0 holds sampled
//                   values and column 1 the percentile each one maps to. The
//                   rows of all distributions are stored back to back.
//   LENS          - int tensor with one entry per feature: the number of rows
//                   of VAL_PCT_PAIRS that belong to that feature's
//                   distribution.
// Output:
//   PCT           - float tensor with the same shape as X.
//
// Column j of X is looked up in the j-th distribution. The lookup uses a
// binary search, so the values of each distribution must be sorted in
// ascending order. An exact match returns the mapped percentile, a value past
// the largest sample returns 1, a value below the smallest sample returns 0,
// and anything else is linearly interpolated between its two neighbours.
template <>
bool PercentileOp<CPUContext>::RunOnDevice() {
  const auto& original_values = Input(X);
  CAFFE_ENFORCE_EQ(original_values.ndim(), 2);
  const auto num_examples = original_values.dim(0);
  const float* original_values_data = original_values.template data<float>();
  const auto num_features = original_values.dim(1);

  const auto& value_pct_pairs = Input(VAL_PCT_PAIRS);
  CAFFE_ENFORCE_EQ(value_pct_pairs.ndim(), 2);
  CAFFE_ENFORCE_EQ(value_pct_pairs.dim(1), 2);
  const int num_values = value_pct_pairs.dim(0);
  const float* value_pct_data = value_pct_pairs.template data<float>();

  const auto& lengths = Input(LENS);
  const int* lengths_data = lengths.template data<int>();
  CAFFE_ENFORCE_EQ(lengths.size(), num_features);
  // A negative length can still satisfy the sum check below (e.g. [3, -1]
  // with two pairs) and would make the search ranges run past the end of the
  // scratch buffers, so reject it explicitly.
  for (int j = 0; j < num_features; j++) {
    CAFFE_ENFORCE_GE(
        lengths_data[j],
        0,
        "Each distribution length should be non-negative");
  }
  CAFFE_ENFORCE_EQ(
      std::accumulate(lengths_data, lengths_data + lengths.size(), 0),
      num_values,
      "Sum of lengths should be equal to the total number of samples");

  // De-interleave the (value, percentile) pairs into two contiguous member
  // buffers so that the values can be binary-searched directly.
  values_tensor.Resize(num_values);
  percentiles_tensor.Resize(num_values);
  float* values_tensor_data = values_tensor.template mutable_data<float>();
  float* percentiles_tensor_data =
      percentiles_tensor.template mutable_data<float>();
  for (int ind = 0; ind < num_values; ind++) {
    values_tensor_data[ind] = value_pct_data[2 * ind];
    percentiles_tensor_data[ind] = value_pct_data[2 * ind + 1];
  }

  auto* percentile_values = Output(PCT);
  percentile_values->ResizeLike(original_values);
  float* percentile_values_data =
      percentile_values->template mutable_data<float>();

  // current_ind walks X (and the output) in row-major order.
  int current_ind = 0;
  for (int i = 0; i < num_examples; i++) {
    // Every row starts again at the first distribution.
    int current_dist_start = 0;
    for (int j = 0; j < num_features; j++) {
      const int current_length = lengths_data[j];
      const int current_dist_end = current_dist_start + current_length;
      const float raw_value = original_values_data[current_ind];

      // Index (into the flat buffers) of the first sample of distribution j
      // that is not less than raw_value.
      const auto bound_ind = std::lower_bound(
                                 values_tensor_data + current_dist_start,
                                 values_tensor_data + current_dist_end,
                                 raw_value) -
          values_tensor_data;

      if (bound_ind == current_dist_end) {
        // Larger than every sample (or the distribution is empty).
        percentile_values_data[current_ind] = 1.0;
      } else if (raw_value == values_tensor_data[bound_ind]) {
        // Exact match: use the mapped percentile as-is.
        percentile_values_data[current_ind] =
            percentiles_tensor_data[bound_ind];
      } else if (bound_ind == current_dist_start) {
        // Smaller than every sample.
        percentile_values_data[current_ind] = 0.0;
      } else {
        // Strictly between two neighbouring samples: interpolate linearly.
        const float lower_pct = percentiles_tensor_data[bound_ind - 1];
        const float upper_pct = percentiles_tensor_data[bound_ind];
        const float interval_length =
            values_tensor_data[bound_ind] - values_tensor_data[bound_ind - 1];
        const float normalized_dist_to_lower =
            (raw_value - values_tensor_data[bound_ind - 1]) / interval_length;
        percentile_values_data[current_ind] =
            lower_pct + normalized_dist_to_lower * (upper_pct - lower_pct);
      }

      current_dist_start = current_dist_end;
      current_ind++;
    }
  }
  return true;
}
// Registers the CPU implementation and declares the operator schema
// (3 inputs, 1 output). The operator has no gradient.
REGISTER_CPU_OPERATOR(Percentile, PercentileOp<CPUContext>);
OPERATOR_SCHEMA(Percentile)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
This operator is used to find percentile representations for raw values, given a sample
set of raw values, labeled with their corresponding percentiles from the same distribution.
In particular, this operator takes as input a tensor of floats to find the percentile values
for, a 2D tensor of floats, where the first column of the tensor represents sampled values,
and the second column represents the percentile labels, and a tensor of integer lengths.
This lengths tensor is used because the operator works on multiple sets of raw values at the same time. For
example, for an input:
original_values=[[3, 5, 3],[5, 1, 6]], lengths = [2, 1, 1], value_to_pct = [[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]
Our operator expects that each column i of the input tensor is sampled from distribution i. Lengths tells
us that the first two elements in value_to_pct are sampled from distribution 1, the next is from distribution 2,
and the last is from distribution 3. We expect the output of our operator to give us [[0.2, 1.0, 0.6], [0.5, 0.3, 1.0]].
To calculate the percentile of an element, we check to see if its value is already mapped to
a percentile in value_to_pct. If so, we return that value. If not, we linearly interpolate between
the two closest values in value_to_pct. If the value is larger than all values in value_to_pct, we
return 1. If it's smaller than all the values, we return 0.
)DOC")
    .Input(
        0,
        "original_values",
        "Input 2D tensor of floats, representing the original, raw data to calculate percentiles for.")
    .Input(
        1,
        "value_to_pct",
        "Sorted 2D tensor, with 2 columns. Each element in the first column is a float representing the"
        " raw value of a sample. Its corresponding element in the next column represents the percentile it maps to.")
    .Input(
        2,
        "lengths",
        "1D tensor, representing the length of each distribution. We expect that the sum of elements of this tensor"
        " is equal to the total length of value_to_pct.")
    .Output(
        0,
        "percentile_values",
        "2D tensor of floats, with the same dimensions as the input tensor original_values. Each element "
        "of this tensor, percentile_values[i][j], corresponds to the percentile calculated "
        "for original_values[i][j].");
NO_GRADIENT(Percentile);
} // namespace caffe2

View file

@ -0,0 +1,34 @@
// Operator to calculate percentile values for an input tensor of data,
// given samples of data from the same distribution, labeled with their
// percentile values.
#ifndef CAFFE2_OPERATORS_PERCENTILE_OP_H_
#define CAFFE2_OPERATORS_PERCENTILE_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Operator that maps raw values to percentiles using labeled samples.
// Only the declaration lives here; RunOnDevice() is defined per context.
template <typename Context>
class PercentileOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  PercentileOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override;

 protected:
  // Input slots: raw values, (value, percentile) pairs, per-distribution
  // lengths. Output slot: the computed percentiles.
  INPUT_TAGS(X, VAL_PCT_PAIRS, LENS);
  OUTPUT_TAGS(PCT);

  // Member tensors kept on the operator for use by RunOnDevice().
  Tensor<Context> values_tensor;
  Tensor<Context> percentiles_tensor;
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_PERCENTILE_OP_H_

View file

@ -0,0 +1,96 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace, dyndep
import caffe2.python.hypothesis_test_util as hu
import numpy as np
class TestPercentileOp(hu.HypothesisTestCase):
    # Fixed-input checks for the 'Percentile' operator.

    def _test_percentile_op(
        self,
        original_inp,
        value_to_pct_map,
        dist_lengths,
        expected_values
    ):
        """Run the 'Percentile' operator once and check its output.

        original_inp and value_to_pct_map are fed as float32 arrays,
        dist_lengths as an int32 array. The fetched 'percentile_values'
        blob must match expected_values to 5 decimal places.
        """
        percentile_op = core.CreateOperator(
            'Percentile',
            ['original_values', 'value_to_pct_map', 'dist_lengths'],
            ['percentile_values']
        )

        # (blob name, payload, dtype) for every operator input.
        feeds = (
            ('original_values', original_inp, np.float32),
            ('value_to_pct_map', value_to_pct_map, np.float32),
            ('dist_lengths', dist_lengths, np.int32),
        )
        for blob_name, payload, blob_dtype in feeds:
            workspace.FeedBlob(blob_name, np.array(payload, dtype=blob_dtype))

        workspace.RunOperatorOnce(percentile_op)

        actual = workspace.FetchBlob('percentile_values')
        desired = np.array(expected_values)
        np.testing.assert_array_almost_equal(actual, desired, decimal=5)

    # Single value, single distribution, exact match.
    def test_percentile_op_with_only_one_dist(self):
        self._test_percentile_op(
            original_inp=[[5]],
            value_to_pct_map=[[5, 0.4]],
            dist_lengths=[1],
            expected_values=[[0.4]],
        )

    # Every queried value is present in its distribution's map.
    def test_percentile_op_with_all_elements_in_map(self):
        pairs = [[3, 0.3], [4, 0.6], [10, 0.8], [4, 0.5], [5, 0.6]]
        self._test_percentile_op(
            original_inp=[[3, 4], [10, 4]],
            value_to_pct_map=pairs,
            dist_lengths=[3, 2],
            expected_values=[[0.3, 0.5], [0.8, 0.5]],
        )

    # The same raw value is resolved against different distributions.
    def test_percentile_op_with_same_value(self):
        pairs = [[1, 0.1], [4, 0.4], [2, 0.5]]
        self._test_percentile_op(
            original_inp=[[1, 1], [1, 2]],
            value_to_pct_map=pairs,
            dist_lengths=[2, 1],
            expected_values=[[0.1, 0.0], [0.1, 0.5]],
        )

    # Values above the largest sample map to 1.
    def test_percentile_op_with_elements_bigger_than_map_range(self):
        pairs = [[1, 0.1], [4, 0.4], [2, 0.1], [3, 0.3]]
        self._test_percentile_op(
            original_inp=[[1, 5], [3, 4]],
            value_to_pct_map=pairs,
            dist_lengths=[2, 2],
            expected_values=[[0.1, 1.], [0.3, 1.0]],
        )

    # Values below the smallest sample map to 0.
    def test_percentile_op_with_elements_smaller_than_map_range(self):
        pairs = [[2, 0.2], [5, 0.5], [7, 0.5]]
        self._test_percentile_op(
            original_inp=[[1], [5], [6]],
            value_to_pct_map=pairs,
            dist_lengths=[3],
            expected_values=[[0.0], [0.5], [0.5]],
        )

    # Values between two samples are linearly interpolated.
    def test_percentile_op_with_interpolation(self):
        pairs = [
            [1, 0.1], [4, 0.7], [4.5, 0.8],
            [6, 0.5], [8, 0.9],
            [8, 0.6],
        ]
        self._test_percentile_op(
            original_inp=[[3, 2, 5], [6, 7, 8]],
            value_to_pct_map=pairs,
            dist_lengths=[3, 2, 1],
            expected_values=[[0.5, 0.0, 0.0], [1.0, 0.7, 0.6]],
        )

    # Several samples per distribution.
    def test_percentile_op_with_large_sample_size_per_dist(self):
        pairs = [
            [3, 0.5], [4, 0.6], [5, 0.7],
            [1, 0.2], [2, 0.3], [5, 0.8],
        ]
        self._test_percentile_op(
            original_inp=[[3, 1], [5, 7]],
            value_to_pct_map=pairs,
            dist_lengths=[3, 3],
            expected_values=[[0.5, 0.2], [0.7, 1.0]],
        )
# Run the test cases when this module is executed directly as a script.
if __name__ == "__main__":
    from unittest import main as run_unittest_main
    run_unittest_main()