Add DataFrameTool (#2456)

Add DataFrameTool to feed inputs from a pandas DataFrame
2026-06-27 03:11:28 +00:00 · 2019-12-02 10:12:03 -08:00 · 2019-12-02 10:12:03 -08:00 · ec88f6d8d6
commit ec88f6d8d6
parent 89824b35e9
4 changed files with 291 additions and 0 deletions
--- a/onnxruntime/python/tools/automl/README.md
+++ b/onnxruntime/python/tools/automl/README.md
@ -0,0 +1,27 @@
+# DataFrameTool overview
+
+This tool helps to feed data from an instance of a pandas DataFrame to a loaded ONNX model using the ONNX Runtime API.
+
+## Example of usage
+
+See example of usage in data_frame_tool_test.py in the same directory.
+
+```python
+import onnxruntime as onnxrt
+import numpy as np
+import pandas as pd
+
+from data_frame_tool import DataFrameTool
+
+# Configure the session; DataFrameTool loads the ONNX model itself
+sess_options = onnxrt.SessionOptions()
+sess_options.enable_profiling = args.profile
+df_tool = DataFrameTool(args.model_path, sess_options)
+
+df = pd.DataFrame([['string_input', True, np.float32(0.25)]], index=[0], columns=['F2', 'Label', 'F1'])
+
+# Column names are matched to the model input names and only the first row is fed.
+# The second argument lists the output names of interest.
+outputs = df_tool.execute(df, [])
+```
--- a/onnxruntime/python/tools/automl/create_test_model.py
+++ b/onnxruntime/python/tools/automl/create_test_model.py
@ -0,0 +1,91 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+import onnx
+import numpy as np
+import os
+import sys
+import argparse
+from onnx import numpy_helper
+from onnx import helper
+from onnx import utils
+from onnx import AttributeProto, TensorProto, GraphProto
+
+def parse_arguments():
+    """
+    Parse the command line of this script.
+
+    :return: the argparse namespace; its ``output_file`` attribute holds the
+        value of the required ``--output_file`` option.
+    """
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("--output_file", required=True, help="Model file name to save")
+    parsed = arg_parser.parse_args()
+    return parsed
+
+def create_model():
+    """
+    Build the test feed model and save it to the file given by --output_file.
+
+    The graph holds one independent Identity node per tested element type
+    (bool, string, double, int8, int16, int32, int64 and float32). Every
+    input and output is a [1,1] tensor. Float32Input additionally gets an
+    initializer of the same name, which gives that input a default value.
+    """
+    args = parse_arguments()
+
+    # One row per Identity node: (input/output name prefix, element type, node name).
+    # The row order fixes the order of the graph nodes, inputs and outputs.
+    identity_specs = [
+        ('Bool', TensorProto.BOOL, 'BoolIdentity'),
+        ('String', TensorProto.STRING, 'StringIdentity'),
+        ('Double', TensorProto.DOUBLE, 'DoubleIdentity'),
+        ('Int8', TensorProto.INT8, 'Int8Identity'),
+        ('Int16', TensorProto.INT16, 'Int16Identity'),
+        ('Int32', TensorProto.INT32, 'Int32Identity'),
+        ('Int64', TensorProto.INT64, 'Int64Identity'),
+        # Optional input, as it has an initializer (created below). It is the
+        # interesting case because it needs float32, which is not what pandas
+        # gives a column of plain Python floats.
+        ('Float32', TensorProto.FLOAT, 'OptionalIdentity'),
+    ]
+
+    graph_inputs = []
+    graph_outputs = []
+    graph_nodes = []
+    for prefix, elem_type, node_name in identity_specs:
+        input_name = prefix + 'Input'
+        output_name = prefix + 'Output'
+        graph_inputs.append(helper.make_tensor_value_info(input_name, elem_type, [1, 1]))
+        graph_outputs.append(helper.make_tensor_value_info(output_name, elem_type, [1, 1]))
+        graph_nodes.append(helper.make_node('Identity', inputs=[input_name],
+                                            outputs=[output_name], name=node_name))
+
+    # Default value for Float32Input, used when the caller does not feed it.
+    default_float32 = helper.make_tensor(name='Float32Input', data_type=TensorProto.FLOAT, dims=[1, 1],
+                                         vals=np.array([[.0]]).astype(np.float32), raw=False)
+
+    graph_def = helper.make_graph(nodes=graph_nodes,
+                                  name='optional_input_graph',
+                                  inputs=graph_inputs,
+                                  outputs=graph_outputs,
+                                  initializer=[default_float32])
+
+    model_def = helper.make_model(graph_def, producer_name='feed_inputs_test')
+    # NOTE(review): onnx.utils.polish_model may be missing from newer onnx
+    # releases - confirm against the onnx version this script targets.
+    final_model = onnx.utils.polish_model(model_def)
+    onnx.save(final_model, args.output_file)
+
+if __name__ == "__main__":
+    # create_model() has no return statement, so a normal run exits with status 0
+    exit_status = create_model()
+    sys.exit(exit_status)
+
+
--- a/onnxruntime/python/tools/automl/data_frame_tool.py
+++ b/onnxruntime/python/tools/automl/data_frame_tool.py
@ -0,0 +1,138 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+import numpy as np
+import onnxruntime as onnxrt
+
+# Model element types that a pandas floating-point column may be converted to.
+ort_float_set = {np.float32, np.float64}
+
+# Names of the pandas column dtypes accepted as a source for any type in ort_float_set.
+pd_float_set = {'float64'}
+
+# Model element types that a pandas integer column may be converted to.
+ort_int_set = {np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64}
+
+# Names of the pandas column dtypes accepted as a source for any type in ort_int_set.
+pd_int_set = {'int64'}
+
+# Maps the type string of a model input to the numpy type its feed is cast to.
+types_dict = {
+    'tensor(float16)': np.float16,
+    'tensor(float)'  : np.float32,
+    'tensor(double)' : np.float64,
+
+    'tensor(int8)'   : np.int8,
+    'tensor(uint8)'  : np.uint8,
+    'tensor(int16)'  : np.int16,
+    'tensor(uint16)' : np.uint16,
+    'tensor(int32)'  : np.int32,
+    'tensor(uint32)' : np.uint32,
+    'tensor(int64)'  : np.int64,
+    'tensor(uint64)' : np.uint64,
+
+    # np.bool and np.object were plain aliases of the builtins. The aliases were
+    # deprecated in NumPy 1.20 and removed in 1.24, so the builtins are used
+    # directly; the resulting dtypes are the same.
+    'tensor(bool)'   : bool,
+    'tensor(string)' : object
+}
+
+class DataFrameTool():
+    """
+    This is a utility class used to run a model with pandas.DataFrame input
+    """
+    def __init__(self, model_path, sess_options=None):
+        """
+        Load the model into a new inference session owned by this tool.
+
+        :param model_path: path to the model to be loaded
+        :param sess_options: see onnxruntime.SessionOptions
+        """
+        self._model_path = model_path
+        self._sess_options = sess_options
+        self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)
+
+    def _process_input_list(self, df, input_metas, require):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values for a given input_meta
+        The function does the heavy lifting for _get_input_feeds()
+
+        :param df: See :class:`pandas.DataFrame`. Only its first row is read.
+        :param input_metas: a list of input descriptions, each with name, type and shape
+        :param require: a boolean. If True this helper raises on a missing input,
+            otherwise inputs without a matching column are skipped.
+        :return: dict mapping each matched input name to its np.array feed
+        :raises TypeError: if a column type cannot be converted to the input type
+        :raises RuntimeError: if require is True and an input has no matching column
+        """
+        feeds = {}
+        for input_meta in input_metas:
+            # A single row is fed, so every dimension that is not a concrete
+            # non-zero integer (None, 0, or a non-integer placeholder such as a
+            # symbolic dimension name) is fed as 1.
+            shape = [dim if isinstance(dim, int) and dim else 1 for dim in input_meta.shape]
+            # We fully expect all the types are in the above dictionary
+            assert input_meta.type in types_dict, "Update types_dict for the new type"
+            expected_type = types_dict[input_meta.type]
+
+            if input_meta.name not in df.columns:
+                if require:
+                    raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
+                                   input_meta.name, expected_type))
+                continue
+
+            column = df[input_meta.name]
+            # Any column may feed a string input. Otherwise the column type must
+            # match exactly, or be a pandas float/int type feeding one of the
+            # float/int model types. float16 and bool are in neither set, so
+            # they always require an exact match.
+            convertible = (input_meta.type == 'tensor(string)'
+                           or expected_type == column.dtype
+                           or (expected_type in ort_float_set and str(column.dtype) in pd_float_set)
+                           or (expected_type in ort_int_set and str(column.dtype) in pd_int_set))
+            if not convertible:
+                raise TypeError("Input {} expected to be of type: {} got {} ".format(
+                            input_meta.name, expected_type, column.dtype))
+
+            # .iloc[0] takes the first row by position. A plain [0] is a label
+            # lookup and fails when the DataFrame index has no label 0.
+            feeds[input_meta.name] = np.array([column.iloc[0]]).astype(expected_type).reshape(shape)
+        return feeds
+
+
+    def _get_input_feeds(self, df, sess):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values
+        This function accepts Pandas DataFrame as the first argument and onnxruntime
+        session with a loaded model. The function interrogates the model for the inputs
+        and matches the model input names to the DataFrame instance column names.
+        It requires exact matches for bool and float16 types. It attempts to convert to
+        string any input type if string is required.
+        It attempts to convert floating types to each other and does the same for all of the
+        integer types without requiring an exact match.
+
+        :param df: See :class:`pandas.DataFrame`. The function only considers the first row (0) of each column
+            and feeds the data to the appropriate model inputs.
+
+        :param sess: See :class:`onnxruntime.InferenceSession`.
+        :raises RuntimeError: if the DataFrame is empty or a mandatory input is missing
+
+        ::
+        For example: pd.DataFrame([[0, 4, 20]], index=[0], columns=['A', 'B', 'C'])
+
+        """
+        if df.empty:
+            raise RuntimeError('input DataFrame is empty')
+
+        # Process mandatory inputs. Raise an error if anything is not present
+        feeds = self._process_input_list(df, sess.get_inputs(), True)
+        # Process optional overridable initializers. If present the initializer value
+        # is overridden by the input. If not, the initializer value embedded in the model takes effect.
+        initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)
+
+        feeds.update(initializers)
+
+        return feeds
+
+    def execute(self, df, output_names, run_options=None):
+        """
+        Compute the predictions.
+
+        Return a list of output values restricted to output names if not empty
+
+        :param df: See :class:`pandas.DataFrame`.
+        :param output_names: name of the outputs that we are interested in
+        :param run_options: See :class:`onnxruntime.RunOptions`.
+
+        ::
+
+        sess.run([output_name], {input_name: x})
+        """
+        input_feed = self._get_input_feeds(df, self._sess)
+        return self._sess.run(output_names, input_feed, run_options)
+
--- a/onnxruntime/python/tools/automl/data_frame_tool_test.py
+++ b/onnxruntime/python/tools/automl/data_frame_tool_test.py
@ -0,0 +1,35 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+import argparse
+import onnxruntime as onnxrt
+import numpy as np
+import pandas as pd
+from data_frame_tool import DataFrameTool
+import os
+import sys
+
+def main():
+    """
+    Run a model on a one-row DataFrame through DataFrameTool and print the outputs.
+
+    The command line takes the model path and an optional -profile switch
+    that enables profiling on the session options.
+    """
+    arg_parser = argparse.ArgumentParser(description='Test Feed Inputs utility')
+    arg_parser.add_argument('model_path', help='model path')
+    arg_parser.add_argument('-profile', action='store_true',
+                            help='enable chrome timeline trace profiling.')
+    cli = arg_parser.parse_args()
+
+    # The session options carry the profiling switch into the tool
+    options = onnxrt.SessionOptions()
+    options.enable_profiling = cli.profile
+    tool = DataFrameTool(cli.model_path, options)
+
+    # A single row with one value per column; the column names follow the
+    # input names of the model built by create_test_model.py
+    input_names = ['StringInput', 'DoubleInput', 'Int8Input', 'Int16Input', 'Int32Input', 'Int64Input', 'BoolInput', 'Float32Input']
+    first_row = ['string_input', 3.25, 8, 16, 32, 64, True, 0.25]
+    df = pd.DataFrame([first_row], columns=input_names)
+
+    # No output names are given to execute()
+    outputs = tool.execute(df, [])
+    print('Outputs: ', outputs)
+
+if __name__ == "__main__":
+    # main() has no return statement, so a normal run exits with status 0
+    exit_status = main()
+    sys.exit(exit_status)