onnxruntime/docs/python/examples/plot_convert_pipeline_vectorizer.py

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""
Train, convert and predict with ONNX Runtime
============================================

This example demonstrates an end to end scenario
starting with the training of a scikit-learn pipeline
which takes as inputs not a regular vector but a
dictionary ``{ int: float }`` as its first step is a
`DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_.

Train a pipeline
++++++++++++++++

The first step consists of creating a dummy dataset.
"""

import pandas
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Synthetic regression problem: 1000 samples and a single target column.
X, y = make_regression(n_samples=1000, n_targets=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# The pipeline below starts with a DictVectorizer, so every observation is
# turned into a ``{column_index: value}`` dictionary. The first column is
# left out (``[:, 1:]``); transposing the frame makes ``to_dict`` emit one
# inner dictionary per original row.
train_frame = pandas.DataFrame(X_train[:, 1:])
test_frame = pandas.DataFrame(X_test[:, 1:])
X_train_dict = train_frame.T.to_dict().values()
X_test_dict = test_frame.T.to_dict().values()

####################################
# We create a pipeline.

from sklearn.ensemble import GradientBoostingRegressor  # noqa: E402
from sklearn.feature_extraction import DictVectorizer  # noqa: E402
from sklearn.pipeline import make_pipeline  # noqa: E402

# The vectorizer is configured with ``sparse=False`` and feeds a gradient
# boosting regressor left at its default settings.
vectorizer = DictVectorizer(sparse=False)
regressor = GradientBoostingRegressor()
pipe = make_pipeline(vectorizer, regressor)

pipe.fit(X_train_dict, y_train)

####################################
# We compute the predictions on the test set
# and we show the R2 score.
from sklearn.metrics import r2_score  # noqa: E402

# Predictions of the scikit-learn pipeline on the held-out dictionaries,
# scored against the true targets.
pred = pipe.predict(X_test_dict)
test_r2 = r2_score(y_test, pred)
print(test_r2)

####################################
# Conversion to ONNX format
# +++++++++++++++++++++++++
#
# We use module
# `sklearn-onnx <https://github.com/onnx/sklearn-onnx>`_
# to convert the model into ONNX format.

from skl2onnx import convert_sklearn  # noqa: E402
from skl2onnx.common.data_types import DictionaryType, FloatTensorType, Int64TensorType  # noqa: E402

# The single model input is declared as a dictionary type built from an
# int64 key type and a float value type.
key_type = Int64TensorType([1])
value_type = FloatTensorType([])
initial_type = [("float_input", DictionaryType(key_type, value_type))]
onx = convert_sklearn(pipe, initial_types=initial_type, target_opset=17)

# Write the serialized model to disk.
with open("pipeline_vectorize.onnx", "wb") as f:
    f.write(onx.SerializeToString())

##################################
# We load the model with ONNX Runtime and look at
# its input and output.
import onnxruntime as rt  # noqa: E402
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument  # noqa: E402

# Hand every execution provider reported by this installation to the session.
available_providers = rt.get_available_providers()
sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=available_providers)

# Only the first input and the first output of the model are inspected.
inp = sess.get_inputs()[0]
out = sess.get_outputs()[0]
print(f"input name='{inp.name}' and shape={inp.shape} and type={inp.type}")
print(f"output name='{out.name}' and shape={out.shape} and type={out.type}")

##################################
# We compute the predictions.
# We could do that in one call:

# Feed the whole collection of dictionaries in a single call. Any
# RuntimeError / InvalidArgument is printed instead of propagated so that
# the script keeps running.
batch_feed = {inp.name: X_test_dict}
try:
    sess.run([out.name], batch_feed)[0]
except (RuntimeError, InvalidArgument) as err:
    print(err)

#############################
# But it fails because, in case of a DictVectorizer,
# ONNX Runtime expects one observation at a time.
pred_onx = []
for row in X_test_dict:
    # One ``run`` call per dictionary: ``[0]`` selects the single requested
    # output and ``[0, 0]`` the element at row 0, column 0 of that output.
    row_output = sess.run([out.name], {inp.name: row})[0]
    pred_onx.append(row_output[0, 0])

###############################
# We compare them to the predictions of the scikit-learn model.
print(r2_score(y_true=pred, y_pred=pred_onx))

#########################
# Very similar. *ONNX Runtime* uses floats instead of doubles,
# that explains the small discrepancies.
Add a page in the documentation for every operator in onnxruntime (#14340) 2023-03-30 21:39:16 +00:00			`# Copyright (c) Microsoft Corporation. All rights reserved.`
			`# Licensed under the MIT License.`

			`"""`
			`Train, convert and predict with ONNX Runtime`
			`============================================`

			`This example demonstrates an end to end scenario`
			`starting with the training of a scikit-learn pipeline`
			`which takes as inputs not a regular vector but a`
			dictionary ``{ int: float }`` as its first step is a
			`DictVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html>`_.

			`Train a pipeline`
			`++++++++++++++++`

			`The first step consists in creating a dummy datasets.`
			`"""`
Update win-ort-main to tip main 250116 (#23398) ### Description This PR is to update the win-ort-main branch to the tip main branch as of 2025-01-16. ### Motivation and Context This update includes the OpenVino fix for debug builds. --------- Signed-off-by: Liqun Fu <liqfu@microsoft.com> Signed-off-by: Liqun Fu <liqun.fu@microsoft.com> Signed-off-by: Junze Wu <junze.wu@intel.com> Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: Jianhui Dai <jianhui.j.dai@intel.com> Co-authored-by: Yueqing Zhang <yuz75@Pitt.edu> Co-authored-by: amancini-N <63410090+amancini-N@users.noreply.github.com> Co-authored-by: Adrian Lizarraga <adlizarraga@microsoft.com> Co-authored-by: liqun Fu <liqfu@microsoft.com> Co-authored-by: Guenther Schmuelling <guschmue@microsoft.com> Co-authored-by: Yifan Li <109183385+yf711@users.noreply.github.com> Co-authored-by: yf711 <yifanl@microsoft.com> Co-authored-by: Wanming Lin <wanming.lin@intel.com> Co-authored-by: wejoncy <wejoncy@163.com> Co-authored-by: wejoncy <wejoncy@.com> Co-authored-by: Scott McKay <skottmckay@gmail.com> Co-authored-by: Changming Sun <chasun@microsoft.com> Co-authored-by: Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com> Co-authored-by: Dmitry Deshevoy <mityada@gmail.com> Co-authored-by: xhcao <xinghua.cao@intel.com> Co-authored-by: Yueqing Zhang <yueqingz@amd.com> Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Co-authored-by: Jiajia Qin <jiajiaqin@microsoft.com> Co-authored-by: Wu, Junze <junze.wu@intel.com> Co-authored-by: Jian Chen <cjian@microsoft.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Matthieu Darbois <mayeut@users.noreply.github.com> Co-authored-by: Prathik Rao <prathik.rao@gmail.com> Co-authored-by: wonchung-microsoft <wonchung@microsoft.com> Co-authored-by: Vincent Wang <wangwchpku@outlook.com> Co-authored-by: PARK DongHa <luncliff@gmail.com> Co-authored-by: Hector Li <hecli@microsoft.com> 
Co-authored-by: Sam Webster <13457618+samwebster@users.noreply.github.com> Co-authored-by: Adrian Lizarraga <adrianlm2@gmail.com> Co-authored-by: Preetha Veeramalai <preetha.veeramalai@intel.com> Co-authored-by: jatinwadhwa921 <jatin.wadhwa@intel.com> Co-authored-by: Satya Kumar Jandhyala <satya.k.jandhyala@gmail.com> Co-authored-by: Corentin Maravat <101636442+cocotdf@users.noreply.github.com> Co-authored-by: Xiaoyu <85524621+xiaoyu-work@users.noreply.github.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jie Chen <jie.a.chen@intel.com> Co-authored-by: Jianhui Dai <jianhui.j.dai@intel.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Baiju Meswani <bmeswani@microsoft.com> Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Co-authored-by: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: Artur Wojcik <artur.wojcik@outlook.com> Co-authored-by: Ted Themistokleous <tedthemistokleous@amd.com> Co-authored-by: Xinya Zhang <Xinya.Zhang@amd.com> Co-authored-by: ikalinic <ilija.kalinic@amd.com> Co-authored-by: sstamenk <sstamenk@amd.com> Co-authored-by: Yi-Hong Lyu <yilyu@microsoft.com> Co-authored-by: Ti-Tai Wang <titaiwang@microsoft.com> 2025-01-16 23:20:25 +00:00
Add a page in the documentation for every operator in onnxruntime (#14340) 2023-03-30 21:39:16 +00:00			`import pandas`
			`from sklearn.datasets import make_regression`
			`from sklearn.model_selection import train_test_split`

			`X, y = make_regression(1000, n_targets=1)`

			`X_train, X_test, y_train, y_test = train_test_split(X, y)`
			`X_train_dict = pandas.DataFrame(X_train[:, 1:]).T.to_dict().values()`
			`X_test_dict = pandas.DataFrame(X_test[:, 1:]).T.to_dict().values()`

			`####################################`
			`# We create a pipeline.`

			`from sklearn.ensemble import GradientBoostingRegressor # noqa: E402`
			`from sklearn.feature_extraction import DictVectorizer # noqa: E402`
			`from sklearn.pipeline import make_pipeline # noqa: E402`

			`pipe = make_pipeline(DictVectorizer(sparse=False), GradientBoostingRegressor())`

			`pipe.fit(X_train_dict, y_train)`

			`####################################`
			`# We compute the prediction on the test set`
			`# and we show the confusion matrix.`
			`from sklearn.metrics import r2_score # noqa: E402`

			`pred = pipe.predict(X_test_dict)`
			`print(r2_score(y_test, pred))`

			`####################################`
			`# Conversion to ONNX format`
			`# +++++++++++++++++++++++++`
			`#`
			`# We use module`
			# `sklearn-onnx <https://github.com/onnx/sklearn-onnx>`_
			`# to convert the model into ONNX format.`

			`from skl2onnx import convert_sklearn # noqa: E402`
			`from skl2onnx.common.data_types import DictionaryType, FloatTensorType, Int64TensorType # noqa: E402`

			`# initial_type = [('float_input', DictionaryType(Int64TensorType([1]), FloatTensorType([])))]`
			`initial_type = [("float_input", DictionaryType(Int64TensorType([1]), FloatTensorType([])))]`
			`onx = convert_sklearn(pipe, initial_types=initial_type, target_opset=17)`
			`with open("pipeline_vectorize.onnx", "wb") as f:`
			`f.write(onx.SerializeToString())`

			`##################################`
			`# We load the model with ONNX Runtime and look at`
			`# its input and output.`
			`import onnxruntime as rt # noqa: E402`
			`from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument # noqa: E402`

			`sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=rt.get_available_providers())`

			`inp, out = sess.get_inputs()[0], sess.get_outputs()[0]`
			`print(f"input name='{inp.name}' and shape={inp.shape} and type={inp.type}")`
			`print(f"output name='{out.name}' and shape={out.shape} and type={out.type}")`

			`##################################`
			`# We compute the predictions.`
			`# We could do that in one call:`

			`try:`
			`sess.run([out.name], {inp.name: X_test_dict})[0]`
			`except (RuntimeError, InvalidArgument) as e:`
			`print(e)`

			`#############################`
			`# But it fails because, in case of a DictVectorizer,`
			`# ONNX Runtime expects one observation at a time.`
			`pred_onx = [sess.run([out.name], {inp.name: row})[0][0, 0] for row in X_test_dict]`

			`###############################`
			`# We compare them to the model's ones.`
			`print(r2_score(pred, pred_onx))`

			`#########################`
			`# Very similar. ONNX Runtime uses floats instead of doubles,`
			`# that explains the small discrepencies.`