You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
For a good UX, scikit-learn will attempt to convert string-dtyped columns of an input dataframe to floats by default during, e.g., RandomForestClassifier.fit (and possibly other estimators). If it can't, it will throw an error. We throw an error in both scenarios.
We should potentially do the same, pending performance impact.
# Reproduction: fit a random forest on a DataFrame whose last column holds strings.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

N = 1000

# Numeric-looking strings: scikit-learn converts these to floats during fit.
categories = ['0', '1', '2']  # automatically coerced, as it assumes there is an implied ordinality
# Non-numeric strings: swap this line in to get "could not convert string to float".
# categories = ['a', 'b', 'c']  # errors, as it assumes cardinality

# Five float features, one string feature, and a binary target.
numeric_data = np.random.rand(N, 5)
string_data = np.random.choice(categories, size=(N, 1))
y = np.random.choice([0, 1], size=N)

# Assemble the frame, then rename every column to x0..x5.
X = pd.DataFrame(numeric_data)
X["str_col"] = string_data
X.columns = [f"x{i}" for i in range(X.shape[1])]

clf = RandomForestClassifier()
clf.fit(X, y)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_3546391/1974482336.py in ?()
      1 clf = RandomForestClassifier()
----> 2 clf.fit(X, y)
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/base.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/base.py) in ?(estimator, *args, **kwargs)
   1385                 skip_parameter_validation=(
   1386                     prefer_skip_nested_validation or global_skip_validation
   1387                 )
   1388             ):
-> 1389                 return fit_method(estimator, *args, **kwargs)
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/ensemble/_forest.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/ensemble/_forest.py) in ?(self, X, y, sample_weight)
    356         # Validate or convert input data
    357         if issparse(y):
    358             raise ValueError("sparse multilabel-indicator for y is not supported.")
    359 
--> 360         X, y = validate_data(
    361             self,
    362             X,
    363             y,
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/validation.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/validation.py) in ?(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
   2957             if "estimator" not in check_y_params:
   2958                 check_y_params = {**default_check_params, **check_y_params}
   2959             y = check_array(y, input_name="y", **check_y_params)
   2960         else:
-> 2961             X, y = check_X_y(X, y, **check_params)
   2962         out = X, y
   2963 
   2964     if not no_val_X and check_params.get("ensure_2d", True):
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/validation.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/validation.py) in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1366         )
   1367 
   1368     ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)
   1369 
-> 1370     X = check_array(
   1371         X,
   1372         accept_sparse=accept_sparse,
   1373         accept_large_sparse=accept_large_sparse,
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/validation.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/validation.py) in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1052                     )
   1053                 array = xp.astype(array, dtype, copy=False)
   1054             else:
   1055                 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
-> 1056         except ComplexWarning as complex_warning:
   1057             raise ValueError(
   1058                 "Complex data not supported\n{}\n".format(array)
   1059             ) from complex_warning
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/_array_api.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/sklearn/utils/_array_api.py) in ?(array, dtype, order, copy, xp, device)
    835         # Use NumPy API to support order
    836         if copy is True:
    837             array = numpy.array(array, order=order, dtype=dtype)
    838         else:
--> 839             array = numpy.asarray(array, order=order, dtype=dtype)
    840 
    841         # At this point array is a NumPy ndarray. We convert it to an array
    842         # container that is consistent with the input's namespace.
[/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/pandas/core/generic.py](http://10.176.1.125:8883/lab/tree/raid/nicholasb/raid/nicholasb/miniforge3/envs/cuml-25.02/lib/python3.11/site-packages/pandas/core/generic.py) in ?(self, dtype, copy)
   2149     def __array__(
   2150         self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
   2151     ) -> np.ndarray:
   2152         values = self._values
-> 2153         arr = np.asarray(values, dtype=dtype)
   2154         if (
   2155             astype_is_view(values.dtype, arr.dtype)
   2156             and using_copy_on_write()
ValueError: could not convert string to float: 'c'
The text was updated successfully, but these errors were encountered:
For a good UX, scikit-learn will attempt to convert string dtyped columns of an input dataframe to floats by default during e.g.,
RandomForestClassifier.fit
(and possibly other estimators). If it can't, it will throw an error. We throw an error in both scenarios. We should potentially do the same, pending performance impact.
The text was updated successfully, but these errors were encountered: