from typing import Dict, Union, Any
import mlflow
import hashlib
import json
import os
from contextlib import contextmanager
from mlflow.exceptions import MlflowException
from mlflow.utils.file_utils import TempDir
from mlflow.entities import RunTag
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils import _get_fully_qualified_class_name
from mlflow.utils.class_utils import _get_class_from_string
from mlflow.utils.annotations import experimental
import logging
import struct
import sys
import math
from collections import OrderedDict
from abc import ABCMeta, abstractmethod
_logger = logging.getLogger(__name__)
class EvaluationArtifact(metaclass=ABCMeta):
"""
    A model evaluation artifact containing an artifact URI and content.
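    Concrete subclasses implement ``_load_content_from_file`` and ``_save``. The sketch
    below is illustrative only (the class name and the JSON representation are assumptions,
    not part of the MLflow API) and shows the minimal shape of such a subclass:

    .. code-block:: python

        import json

        class JsonEvaluationArtifact(EvaluationArtifact):
            # Hypothetical artifact whose content is a JSON-serializable object.
            def _load_content_from_file(self, local_artifact_path):
                # Read and return the parsed JSON content; the base class assigns it
                # to ``self._content``.
                with open(local_artifact_path, "r") as f:
                    return json.load(f)

            def _save(self, output_artifact_path):
                # Serialize the in-memory content (assumed to be already loaded) to JSON.
                with open(output_artifact_path, "w") as f:
                    json.dump(self._content, f)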
"""
def __init__(self, uri, content=None):
self._uri = uri
self._content = content
@abstractmethod
def _load_content_from_file(self, local_artifact_path):
"""
        Abstract interface to load the content from a local artifact file path and
        return the loaded content.
"""
pass
def _load(self, local_artifact_path=None):
"""
If ``local_artifact_path`` is ``None``, download artifact from the artifact uri.
Otherwise, load artifact content from the specified path. Assign the loaded content to
``self._content``, and return the loaded content.
"""
if local_artifact_path is not None:
self._content = self._load_content_from_file(local_artifact_path)
else:
with TempDir() as temp_dir:
temp_dir_path = temp_dir.path()
_download_artifact_from_uri(self._uri, temp_dir_path)
local_artifact_file = temp_dir.path(os.listdir(temp_dir_path)[0])
self._content = self._load_content_from_file(local_artifact_file)
return self._content
@abstractmethod
def _save(self, output_artifact_path):
"""Save artifact content into specified path."""
pass
@property
def content(self):
"""
The content of the artifact (representation varies)
"""
if self._content is None:
self._load()
return self._content
@property
def uri(self) -> str:
"""
The URI of the artifact
"""
return self._uri
def __repr__(self):
return f"{self.__class__.__name__}(uri='{self.uri}')"
class EvaluationResult:
"""
Represents the model evaluation outputs of a `mlflow.evaluate()` API call, containing
both scalar metrics and output artifacts such as performance plots.
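    For example, assuming ``result`` was returned by a previous ``mlflow.models.evaluate(...)``
    call (a minimal sketch; the local directory path is illustrative):

    .. code-block:: python

        # Persist the metrics and artifact metadata/content to a local directory ...
        result.save("/tmp/eval_results")

        # ... and reload them later without re-running the evaluation.
        loaded = EvaluationResult.load("/tmp/eval_results")
        print(loaded.metrics)          # scalar metrics, e.g. {"accuracy": 0.97, ...}
        print(list(loaded.artifacts))  # artifact names, e.g. ["confusion_matrix", ...]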
"""
def __init__(self, metrics, artifacts):
self._metrics = metrics
self._artifacts = artifacts
    @classmethod
def load(cls, path):
"""Load the evaluation results from the specified local filesystem path"""
with open(os.path.join(path, "metrics.json"), "r") as fp:
metrics = json.load(fp)
with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp:
artifacts_metadata = json.load(fp)
artifacts = {}
artifacts_dir = os.path.join(path, "artifacts")
for artifact_name, meta in artifacts_metadata.items():
uri = meta["uri"]
ArtifactCls = _get_class_from_string(meta["class_name"])
artifact = ArtifactCls(uri=uri)
artifact._load(os.path.join(artifacts_dir, artifact_name))
artifacts[artifact_name] = artifact
return EvaluationResult(metrics=metrics, artifacts=artifacts)
    def save(self, path):
"""Write the evaluation results to the specified local filesystem path"""
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, "metrics.json"), "w") as fp:
json.dump(self.metrics, fp)
artifacts_metadata = {
artifact_name: {
"uri": artifact.uri,
"class_name": _get_fully_qualified_class_name(artifact),
}
for artifact_name, artifact in self.artifacts.items()
}
with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp:
json.dump(artifacts_metadata, fp)
artifacts_dir = os.path.join(path, "artifacts")
os.mkdir(artifacts_dir)
for artifact_name, artifact in self.artifacts.items():
artifact._save(os.path.join(artifacts_dir, artifact_name))
@property
def metrics(self) -> Dict[str, Any]:
"""
A dictionary mapping scalar metric names to scalar metric values
"""
return self._metrics
@property
def artifacts(self) -> Dict[str, "mlflow.models.EvaluationArtifact"]:
"""
A dictionary mapping standardized artifact names (e.g. "roc_data") to
artifact content and location information
"""
return self._artifacts
_cached_mlflow_client = None
def _hash_uint64_ndarray_as_bytes(array):
assert len(array.shape) == 1
# see struct pack format string https://docs.python.org/3/library/struct.html#format-strings
return struct.pack(f">{array.size}Q", *array)
def _hash_ndarray_as_bytes(nd_array):
from pandas.util import hash_array
import numpy as np
return _hash_uint64_ndarray_as_bytes(
hash_array(nd_array.flatten(order="C"))
) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64"))
def _hash_array_like_obj_as_bytes(data):
"""
    Helper method to convert a pandas DataFrame, numpy array, or list into bytes for
    MD5 calculation purposes.
"""
from pandas.util import hash_pandas_object
import numpy as np
import pandas as pd
if isinstance(data, pd.DataFrame):
        # Check `'pyspark' in sys.modules` to avoid importing pyspark when the user's
        # code does not use pyspark.
if "pyspark" in sys.modules:
from pyspark.ml.linalg import Vector as spark_vector_type
else:
spark_vector_type = None
def _hash_array_like_element_as_bytes(v):
if spark_vector_type is not None:
if isinstance(v, spark_vector_type):
return _hash_ndarray_as_bytes(v.toArray())
if isinstance(v, np.ndarray):
return _hash_ndarray_as_bytes(v)
if isinstance(v, list):
return _hash_ndarray_as_bytes(np.array(v))
return v
data = data.applymap(_hash_array_like_element_as_bytes)
return _hash_uint64_ndarray_as_bytes(hash_pandas_object(data))
elif isinstance(data, np.ndarray):
return _hash_ndarray_as_bytes(data)
elif isinstance(data, list):
return _hash_ndarray_as_bytes(np.array(data))
else:
raise ValueError("Unsupported data type.")
def _gen_md5_for_arraylike_obj(md5_gen, data):
"""
    Helper method to generate an MD5 hash for an array-like object. The MD5 digest is
    computed over:
     - the array length
     - the content of the first NUM_SAMPLE_ROWS_FOR_HASH rows
     - the content of the last NUM_SAMPLE_ROWS_FOR_HASH rows
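    For instance, the dataset hash is assembled roughly as follows (a sketch mirroring the
    usage in ``EvaluationDataset.__init__``; the data values are illustrative):

    .. code-block:: python

        import hashlib
        import numpy as np

        md5_gen = hashlib.md5()
        _gen_md5_for_arraylike_obj(md5_gen, np.array([[1.0, 2.0], [3.0, 4.0]]))  # features
        _gen_md5_for_arraylike_obj(md5_gen, np.array([0, 1]))                    # labels
        digest = md5_gen.hexdigest()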
"""
import numpy as np
len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64"))
md5_gen.update(len_bytes)
if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2:
md5_gen.update(_hash_array_like_obj_as_bytes(data))
else:
head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH]
tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :]
md5_gen.update(_hash_array_like_obj_as_bytes(head_rows))
md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows))
class EvaluationDataset:
"""
An input dataset for model evaluation. This is intended for use with the
:py:func:`mlflow.models.evaluate()`
API.
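    Instances are normally constructed internally by :py:func:`mlflow.models.evaluate()`,
    but the constructor accepts the same ``data``/``targets`` combinations directly
    (a sketch with illustrative values):

    .. code-block:: python

        import numpy as np
        import pandas as pd

        # Feature matrix plus a label array:
        ds1 = EvaluationDataset(
            data=np.array([[1.0, 2.0], [3.0, 4.0]]),
            targets=np.array([0, 1]),
            name="toy_numpy_dataset",
        )

        # DataFrame plus the name of its label column:
        df = pd.DataFrame({"x1": [1.0, 3.0], "x2": [2.0, 4.0], "label": [0, 1]})
        ds2 = EvaluationDataset(data=df, targets="label")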
"""
NUM_SAMPLE_ROWS_FOR_HASH = 5
SPARK_DATAFRAME_LIMIT = 10000
def __init__(self, data, *, targets, name=None, path=None, feature_names=None):
"""
        The values of the constructor arguments come from the `evaluate` call.
"""
import numpy as np
import pandas as pd
if name is not None and '"' in name:
raise ValueError(f'Dataset name cannot include a double quote (") but got {name}')
if path is not None and '"' in path:
raise ValueError(f'Dataset path cannot include a double quote (") but got {path}')
self._user_specified_name = name
self._path = path
self._hash = None
try:
            # Check `'pyspark' in sys.modules` to avoid importing pyspark when the user's
            # code does not use pyspark.
if "pyspark" in sys.modules:
from pyspark.sql import DataFrame as SparkDataFrame
supported_dataframe_types = (pd.DataFrame, SparkDataFrame)
spark_df_type = SparkDataFrame
else:
supported_dataframe_types = (pd.DataFrame,)
spark_df_type = None
except ImportError:
            supported_dataframe_types = (pd.DataFrame,)
            spark_df_type = None
if feature_names is not None and len(set(feature_names)) < len(list(feature_names)):
raise ValueError(
"`feature_names` argument must be a list containing unique feature names."
)
if isinstance(data, (np.ndarray, list)):
if not isinstance(targets, (np.ndarray, list)):
raise ValueError(
"If data is a numpy array or list of evaluation features, "
"`targets` argument must be a numpy array or list of evaluation labels."
)
if isinstance(data, list):
data = np.array(data)
if len(data.shape) != 2:
raise ValueError(
"If the `data` argument is a numpy array, it must be a 2 dimension array "
"and second dimension represent the number of features. If the `data` "
"argument is a list, each of its element must be a feature array of "
"numpy array or list and all element must has the same length."
)
self._features_data = data
self._labels_data = targets if isinstance(targets, np.ndarray) else np.array(targets)
if len(self._features_data) != len(self._labels_data):
raise ValueError(
"The input features example rows must be the same length with labels array."
)
num_features = data.shape[1]
if feature_names is not None:
feature_names = list(feature_names)
if num_features != len(feature_names):
raise ValueError("feature name list must be the same length with feature data.")
self._feature_names = feature_names
else:
self._feature_names = [
f"feature_{str(i + 1).zfill(math.ceil((math.log10(num_features + 1))))}"
for i in range(num_features)
]
elif isinstance(data, supported_dataframe_types):
if not isinstance(targets, str):
raise ValueError(
"If data is a Pandas DataFrame or Spark DataFrame, `targets` argument must "
"be the name of the column which contains evaluation labels in the `data` "
"dataframe."
)
            if spark_df_type is not None and isinstance(data, spark_df_type):
_logger.warning(
"Specified Spark DataFrame is too large for model evaluation. Only "
f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used."
"If you want evaluate on the whole spark dataframe, please manually call "
"`spark_dataframe.toPandas()`."
)
data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas()
self._labels_data = data[targets].to_numpy()
if feature_names is not None:
self._features_data = data[list(feature_names)]
self._feature_names = feature_names
else:
self._features_data = data.drop(targets, axis=1, inplace=False)
self._feature_names = list(self._features_data.columns)
else:
raise ValueError(
"The data argument must be a numpy array, a list or a Pandas DataFrame, or "
"spark DataFrame if pyspark package installed."
)
# generate dataset hash
md5_gen = hashlib.md5()
_gen_md5_for_arraylike_obj(md5_gen, self._features_data)
_gen_md5_for_arraylike_obj(md5_gen, self._labels_data)
md5_gen.update(",".join(self._feature_names).encode("UTF-8"))
self._hash = md5_gen.hexdigest()
@property
def feature_names(self):
return self._feature_names
@property
def features_data(self):
"""
        Return the features data as a numpy array or a pandas DataFrame.
"""
return self._features_data
@property
def labels_data(self):
"""
        Return the labels data as a numpy array.
"""
return self._labels_data
@property
def name(self):
"""
        Dataset name: the user-specified dataset name if one was provided, otherwise the
        dataset hash.
"""
return self._user_specified_name if self._user_specified_name is not None else self.hash
@property
def path(self):
"""
Dataset path
"""
return self._path
@property
def hash(self):
"""
        Dataset hash, computed over the dataset length and the contents of the first and last
        NUM_SAMPLE_ROWS_FOR_HASH rows.
"""
return self._hash
@property
def _metadata(self):
"""
Return dataset metadata containing name, hash, and optional path.
"""
metadata = {
"name": self.name,
"hash": self.hash,
}
if self.path is not None:
metadata["path"] = self.path
return metadata
def _log_dataset_tag(self, client, run_id, model_uuid):
"""
        Log dataset metadata as the "mlflow.datasets" run tag. If the tag already exists, the
        current dataset metadata is appended to the existing tag content.
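        For example, after logging a single dataset the tag holds a JSON list similar to the
        following (the name, hash, and model UUID values are illustrative):

        .. code-block:: python

            import json

            tag = client.get_run(run_id).data.tags["mlflow.datasets"]
            json.loads(tag)
            # -> [{"name": "my_dataset", "hash": "<dataset_hash>", "model": "<model_uuid>"}]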
"""
existing_dataset_metadata_str = client.get_run(run_id).data.tags.get(
"mlflow.datasets", "[]"
)
dataset_metadata_list = json.loads(existing_dataset_metadata_str)
for metadata in dataset_metadata_list:
if (
metadata["hash"] == self.hash
and metadata["name"] == self.name
and metadata["model"] == model_uuid
):
break
else:
dataset_metadata_list.append({**self._metadata, "model": model_uuid})
dataset_metadata_str = json.dumps(dataset_metadata_list, separators=(",", ":"))
client.log_batch(
run_id,
tags=[RunTag("mlflow.datasets", dataset_metadata_str)],
)
def __hash__(self):
return hash(self.hash)
def __eq__(self, other):
import numpy as np
if not isinstance(other, EvaluationDataset):
return False
if isinstance(self._features_data, np.ndarray):
is_features_data_equal = np.array_equal(self._features_data, other._features_data)
else:
is_features_data_equal = self._features_data.equals(other._features_data)
return (
is_features_data_equal
and np.array_equal(self._labels_data, other._labels_data)
and self.name == other.name
and self.path == other.path
and self._feature_names == other._feature_names
)
class ModelEvaluator(metaclass=ABCMeta):
@abstractmethod
def can_evaluate(self, *, model_type, evaluator_config, **kwargs) -> bool:
"""
:param model_type: A string describing the model type (e.g., "regressor", "classifier", …).
:param evaluator_config: A dictionary of additional configurations for
the evaluator.
:param kwargs: For forwards compatibility, a placeholder for additional arguments
that may be added to the evaluation interface in the future.
:return: True if the evaluator can evaluate the specified model on the
specified dataset. False otherwise.
"""
raise NotImplementedError()
@abstractmethod
def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
"""
The abstract API to log metrics and artifacts, and return evaluation results.
:param model: A pyfunc model instance.
:param model_type: A string describing the model type
(e.g., ``"regressor"``, ``"classifier"``, …).
        :param dataset: An instance of `mlflow.models.evaluation.base.EvaluationDataset`
containing features and labels (optional) for model evaluation.
:param run_id: The ID of the MLflow Run to which to log results.
:param evaluator_config: A dictionary of additional configurations for
the evaluator.
:param kwargs: For forwards compatibility, a placeholder for additional arguments that
may be added to the evaluation interface in the future.
:return: An :py:class:`mlflow.models.EvaluationResult` instance containing
evaluation results.
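        A custom evaluator plugin implements both abstract methods. A minimal sketch (the
        class name and the metric computation are illustrative, not an MLflow-provided
        evaluator):

        .. code-block:: python

            import numpy as np

            class DummyRegressorEvaluator(ModelEvaluator):
                def can_evaluate(self, *, model_type, evaluator_config, **kwargs):
                    return model_type == "regressor"

                def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
                    # Assumes the pyfunc model returns a 1-D array of predictions.
                    preds = np.asarray(model.predict(dataset.features_data))
                    mae = float(np.mean(np.abs(preds - dataset.labels_data)))
                    mlflow.tracking.MlflowClient().log_metric(run_id, "mae", mae)
                    return EvaluationResult(metrics={"mae": mae}, artifacts={})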
"""
raise NotImplementedError()
def list_evaluators():
"""
    Return a list of the names of all available evaluators.
"""
    # Import _model_evaluation_registry inside the function to avoid a circular import.
from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
return list(_model_evaluation_registry._registry.keys())
@contextmanager
def _start_run_or_reuse_active_run():
"""
    A context manager that:
     - yields the ID of the active run, if one exists;
     - otherwise starts a new MLflow run and yields the new run's ID.
"""
active_run = mlflow.active_run()
if not active_run:
        # No active run exists, so start a new run and yield its run ID.
with mlflow.start_run() as run:
yield run.info.run_id
else:
yield active_run.info.run_id
def _normalize_evaluators_and_evaluator_config_args(
evaluators,
evaluator_config,
):
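    """
    Normalize the user-facing `evaluators` / `evaluator_config` arguments into a list of
    evaluator names and a dict mapping each evaluator name to its config dict. For example
    (illustrative, assuming only the "default" evaluator is registered):

    .. code-block:: python

        _normalize_evaluators_and_evaluator_config_args(
            evaluators=None,
            evaluator_config={"log_model_explainability": False},
        )
        # -> (["default"], {"default": {"log_model_explainability": False}})
    """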
from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map):
return isinstance(_evaluator_name_to_conf_map, dict) and all(
k in _evaluator_name_list and isinstance(v, dict)
for k, v in _evaluator_name_to_conf_map.items()
)
if evaluators is None:
evaluator_name_list = list(_model_evaluation_registry._registry.keys())
if len(evaluator_name_list) > 1:
_logger.warning(
f"Multiple registered evaluators are found {evaluator_name_list} and "
"they will all be used in evaluation if they support the specified model type. "
"If you want to evaluate with one evaluator, specify the `evaluator` argument "
"and optionally specify the `evaluator_config` argument."
)
if evaluator_config is not None:
conf_dict_value_error = ValueError(
"If `evaluators` argument is None, all available evaluators will be used. "
"If only the default evaluator is available, the `evaluator_config` argument is "
"interpreted as the config dictionary for the default evaluator. Otherwise, the "
"`evaluator_config` argument must be a dictionary mapping each evaluator's name "
"to its own evaluator config dictionary."
)
if evaluator_name_list == ["default"]:
if not isinstance(evaluator_config, dict):
raise conf_dict_value_error
elif "default" not in evaluator_config:
evaluator_name_to_conf_map = {"default": evaluator_config}
else:
evaluator_name_to_conf_map = evaluator_config
else:
if not check_nesting_config_dict(evaluator_name_list, evaluator_config):
raise conf_dict_value_error
evaluator_name_to_conf_map = evaluator_config
else:
evaluator_name_to_conf_map = {}
elif isinstance(evaluators, str):
if not (evaluator_config is None or isinstance(evaluator_config, dict)):
raise ValueError(
"If `evaluators` argument is the name of an evaluator, evaluator_config must be "
"None or a dict containing config items for the evaluator."
)
evaluator_name_list = [evaluators]
evaluator_name_to_conf_map = {evaluators: evaluator_config}
elif isinstance(evaluators, list):
if evaluator_config is not None:
if not check_nesting_config_dict(evaluators, evaluator_config):
raise ValueError(
"If `evaluators` argument is an evaluator name list, evaluator_config "
"must be a dict contains mapping from evaluator name to individual "
"evaluator config dict."
)
# Use `OrderedDict.fromkeys` to deduplicate elements but keep elements order.
evaluator_name_list = list(OrderedDict.fromkeys(evaluators))
evaluator_name_to_conf_map = evaluator_config or {}
else:
raise ValueError(
"`evaluators` argument must be None, an evaluator name string, or a list of "
"evaluator names."
)
return evaluator_name_list, evaluator_name_to_conf_map
_last_failed_evaluator = None
def _get_last_failed_evaluator():
"""
    Return the name of the last evaluator that failed while running `evaluate`.
    This can be used to check which evaluator failed when the `evaluate` API fails.
"""
return _last_failed_evaluator
def _evaluate(
*, model, model_type, dataset, run_id, evaluator_name_list, evaluator_name_to_conf_map
):
"""
The public API "evaluate" will verify argument first, and then pass normalized arguments
to the _evaluate method.
"""
    # Import _model_evaluation_registry inside the function to avoid a circular import.
from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
global _last_failed_evaluator
_last_failed_evaluator = None
client = mlflow.tracking.MlflowClient()
model_uuid = model.metadata.model_uuid
dataset._log_dataset_tag(client, run_id, model_uuid)
eval_results = []
for evaluator_name in evaluator_name_list:
config = evaluator_name_to_conf_map.get(evaluator_name) or {}
try:
evaluator = _model_evaluation_registry.get_evaluator(evaluator_name)
except MlflowException:
_logger.warning(f"Evaluator '{evaluator_name}' is not registered.")
continue
_last_failed_evaluator = evaluator_name
if evaluator.can_evaluate(model_type=model_type, evaluator_config=config):
_logger.info(f"Evaluating the model with the {evaluator_name} evaluator.")
result = evaluator.evaluate(
model=model,
model_type=model_type,
dataset=dataset,
run_id=run_id,
evaluator_config=config,
)
eval_results.append(result)
_last_failed_evaluator = None
if len(eval_results) == 0:
raise ValueError(
"The model could not be evaluated by any of the registered evaluators, please "
"verify that the model type and other configs are set correctly."
)
merged_eval_result = EvaluationResult(dict(), dict())
for eval_result in eval_results:
merged_eval_result.metrics.update(eval_result.metrics)
merged_eval_result.artifacts.update(eval_result.artifacts)
return merged_eval_result
@experimental
def evaluate(
model: Union[str, "mlflow.pyfunc.PyFuncModel"],
data,
*,
targets,
model_type: str,
dataset_name=None,
dataset_path=None,
feature_names: list = None,
evaluators=None,
evaluator_config=None,
):
"""
Evaluate a PyFunc model on the specified dataset using one or more specified ``evaluators``, and
log resulting metrics & artifacts to MLflow Tracking. For additional overview information, see
:ref:`the Model Evaluation documentation <model-evaluation>`.
Default Evaluator behavior:
- The default evaluator, which can be invoked with ``evaluators="default"`` or
      ``evaluators=None``, supports the ``"regressor"`` and ``"classifier"`` model types.
It generates a variety of model performance metrics, model performance plots, and
model explanations.
    - For both the ``"regressor"`` and ``"classifier"`` model types, the default evaluator generates
model summary plots and feature importance plots using
`SHAP <https://shap.readthedocs.io/en/latest/index.html>`_.
- For regressor models, the default evaluator additionally logs:
- **metrics**: example_count, mean_absolute_error, mean_squared_error,
root_mean_squared_error, sum_on_label, mean_on_label, r2_score, max_error,
mean_absolute_percentage_error.
- For binary classifiers, the default evaluator additionally logs:
- **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall,
precision, f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc.
- **artifacts**: lift curve plot, precision-recall plot, ROC plot.
- For multiclass classifiers, the default evaluator additionally logs:
- **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss
      - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics include
true_negatives/false_positives/false_negatives/true_positives/recall/precision/roc_auc,
precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot.
- The logged MLflow metric keys are constructed using the format:
``{metric_name}_on_{dataset_name}``. Any preexisting metrics with the same name are
overwritten.
- The metrics/artifacts listed above are logged to the active MLflow run.
If no active run exists, a new MLflow run is created for logging these metrics and
artifacts.
- Additionally, information about the specified dataset - hash, name (if specified), path
(if specified), and the UUID of the model that evaluated it - is logged to the
``mlflow.datasets`` tag.
- The available ``evaluator_config`` options for the default evaluator include:
- **log_model_explainability**: A boolean value specifying whether or not to log model
explainability insights, default value is True.
- **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model
        explainability. Supported algorithms include: 'exact', 'permutation', and 'partition'.
If not set, ``shap.Explainer`` is used with the "auto" algorithm, which chooses the best
Explainer based on the model.
- **explainability_nsamples**: The number of sample rows to use for computing model
explainability insights. Default value is 2000.
- **max_classes_for_multiclass_roc_pr**:
For multiclass classification tasks, the maximum number of classes for which to log
the per-class ROC curve and Precision-Recall curve. If the number of classes is
larger than the configured maximum, these curves are not logged.
- Limitations of evaluation dataset:
- For classification tasks, dataset labels are used to infer the total number of classes.
- For binary classification tasks, the negative label value must be 0 or -1 or False, and
the positive label value must be 1 or True.
- Limitations of metrics/artifacts computation:
- For classification tasks, some metric and artifact computations require the model to
output class probabilities. Currently, for scikit-learn models, the default evaluator
calls the ``predict_proba`` method on the underlying model to obtain probabilities. For
other model types, the default evaluator does not compute metrics/artifacts that require
probability outputs.
- Limitations of default evaluator logging model explainability insights:
- The ``shap.Explainer`` ``auto`` algorithm uses the ``Linear`` explainer for linear models
and the ``Tree`` explainer for tree models. Because SHAP's ``Linear`` and ``Tree``
explainers do not support multi-class classification, the default evaluator falls back to
using the ``Exact`` or ``Permutation`` explainers for multi-class classification tasks.
- Logging model explainability insights is not currently supported for PySpark models.
- The evaluation dataset label values must be numeric or boolean, all feature values
must be numeric, and each feature column must only contain scalar values.
:param model: A pyfunc model instance, or a URI referring to such a model.
:param data: One of the following:
- A numpy array or list of evaluation features, excluding labels.
- A Pandas DataFrame or Spark DataFrame, containing evaluation features and
                   labels. If the ``feature_names`` argument is not specified, all columns are regarded
as feature columns. Otherwise, only column names present in ``feature_names``
are regarded as feature columns.
:param targets: If ``data`` is a numpy array or list, a numpy array or list of evaluation
labels. If ``data`` is a DataFrame, the string name of a column from ``data``
that contains evaluation labels.
:param model_type: A string describing the model type. The default evaluator
supports ``"regressor"`` and ``"classifier"`` as model types.
    :param dataset_name: (Optional) The name of the dataset, must not contain double quotes (``"``).
The name is logged to the ``mlflow.datasets`` tag for lineage tracking
purposes. If not specified, the dataset hash is used as the dataset name.
:param dataset_path: (Optional) The path where the data is stored. Must not contain double
                         quotes (``"``). If specified, the path is logged to the ``mlflow.datasets``
tag for lineage tracking purposes.
:param feature_names: (Optional) If the ``data`` argument is a feature data numpy array or list,
``feature_names`` is a list of the feature names for each feature. If
``None``, then the ``feature_names`` are generated using the format
``feature_{feature_index}``. If the ``data`` argument is a Pandas
DataFrame or a Spark DataFrame, ``feature_names`` is a list of the names
of the feature columns in the DataFrame. If ``None``, then all columns
except the label column are regarded as feature columns.
:param evaluators: The name of the evaluator to use for model evaluation, or a list of
evaluator names. If unspecified, all evaluators capable of evaluating the
specified model on the specified dataset are used. The default evaluator
can be referred to by the name ``"default"``. To see all available
evaluators, call :py:func:`mlflow.models.list_evaluators`.
:param evaluator_config: A dictionary of additional configurations to supply to the evaluator.
If multiple evaluators are specified, each configuration should be
supplied as a nested dictionary whose key is the evaluator name.
:return: An :py:class:`mlflow.models.EvaluationResult` instance containing
evaluation results.
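    Example (a minimal sketch; the scikit-learn model and dataset are illustrative and assume
    that scikit-learn is installed):

    .. code-block:: python

        import mlflow
        from sklearn.datasets import load_breast_cancer
        from sklearn.linear_model import LogisticRegression

        X, y = load_breast_cancer(return_X_y=True, as_frame=True)
        model = LogisticRegression(max_iter=1000).fit(X, y)

        with mlflow.start_run():
            # Log the fitted model, then evaluate it on the (training) data for brevity.
            mlflow.sklearn.log_model(model, "model")
            model_uri = mlflow.get_artifact_uri("model")
            result = mlflow.models.evaluate(
                model=model_uri,
                data=X.assign(label=y),
                targets="label",
                model_type="classifier",
                dataset_name="breast_cancer",
                evaluators="default",
                # Explainability is disabled here only to keep the example lightweight.
                evaluator_config={"log_model_explainability": False},
            )
        print(result.metrics)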
"""
from mlflow.pyfunc import PyFuncModel
if isinstance(model, str):
model = mlflow.pyfunc.load_model(model)
elif isinstance(model, PyFuncModel):
pass
else:
raise ValueError(
"The model argument must be a string URI referring to an MLflow model or "
"an instance of `mlflow.pyfunc.PyFuncModel`."
)
(
evaluator_name_list,
evaluator_name_to_conf_map,
) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config)
dataset = EvaluationDataset(
data,
targets=targets,
name=dataset_name,
path=dataset_path,
feature_names=feature_names,
)
with _start_run_or_reuse_active_run() as run_id:
return _evaluate(
model=model,
model_type=model_type,
dataset=dataset,
run_id=run_id,
evaluator_name_list=evaluator_name_list,
evaluator_name_to_conf_map=evaluator_name_to_conf_map,
)