davidsbatista/text-classification

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

SlowMonk opened this issue · 2 comments

y_value = array([221900, 180000, 510000, ..., 360000, 400000, 325000])

List of machine learning algorithms that will be used for predictions

estimator = [('Logistic Regression', LogisticRegression), ('Ridge Classifier', RidgeClassifier),
('SGD Classifier', SGDClassifier), ('Passive Aggressive Classifier', PassiveAggressiveClassifier),
('SVC', SVC), ('Linear SVC', LinearSVC), ('Nu SVC', NuSVC),
('K-Neighbors Classifier', KNeighborsClassifier),
('Gaussian Naive Bayes', GaussianNB), ('Multinomial Naive Bayes', MultinomialNB),
('Bernoulli Naive Bayes', BernoulliNB), ('Complement Naive Bayes', ComplementNB),
('Decision Tree Classifier', DecisionTreeClassifier),
('Random Forest Classifier', RandomForestClassifier), ('AdaBoost Classifier', AdaBoostClassifier),
('Gradient Boosting Classifier', GradientBoostingClassifier), ('Bagging Classifier', BaggingClassifier),
('Extra Trees Classifier', ExtraTreesClassifier), ('XGBoost', XGBClassifier)]

Separating independent features and dependent feature from the dataset

#X_train = titanic.drop(columns='Survived')
#y_train = titanic['Survived']

Creating a dataframe to compare the performance of the machine learning models

comparison_cols = ['Algorithm', 'Training Time (Avg)', 'Accuracy (Avg)', 'Accuracy (3xSTD)']
comparison_df = pd.DataFrame(columns=comparison_cols)

Generating training/validation dataset splits for cross validation

cv_split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

Performing cross-validation to estimate the performance of the models

for idx, est in enumerate(estimator):

cv_results = cross_validate(est[1](), x_value, y_value, cv=cv_split)

comparison_df.loc[idx, 'Algorithm'] = est[0]
comparison_df.loc[idx, 'Training Time (Avg)'] = cv_results['fit_time'].mean()
comparison_df.loc[idx, 'Accuracy (Avg)'] = cv_results['test_score'].mean()
comparison_df.loc[idx, 'Accuracy (3xSTD)'] = cv_results['test_score'].std() * 3

comparison_df.set_index(keys='Algorithm', inplace=True)
comparison_df.sort_values(by='Accuracy (Avg)', ascending=False, inplace=True)

#Visualizing the performance of the models

and the following error occurred:


ValueError Traceback (most recent call last)
in <module>
25 for idx, est in enumerate(estimator):
26
---> 27 cv_results = cross_validate(est[1](), x_value, y_value, cv=cv_split)
28
29 comparison_df.loc[idx, 'Algorithm'] = est[0]

~/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))

~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919

~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
752 tasks = BatchedCalls(itertools.islice(iterator, batch_size),
753 self._backend.get_nested_backend(),
--> 754 self._pickle_cache)
755 if len(tasks) == 0:
756 # No more tasks available in the iterator: tell caller to stop.

~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, iterator_slice, backend_and_jobs, pickle_cache)
208
209 def __init__(self, iterator_slice, backend_and_jobs, pickle_cache=None):
--> 210 self.items = list(iterator_slice)
211 self._size = len(self.items)
212 if isinstance(backend_and_jobs, tuple):

~/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in <genexpr>(.0)
233 pre_dispatch=pre_dispatch)
234 scores = parallel(
--> 235 delayed(_fit_and_score)(
236 clone(estimator), X, y, scorers, train, test, verbose, None,
237 fit_params, return_train_score=return_train_score,

~/.local/lib/python3.6/site-packages/sklearn/model_selection/_split.py in split(self, X, y, groups)
1313 """
1314 X, y, groups = indexable(X, y, groups)
-> 1315 for train, test in self._iter_indices(X, y, groups):
1316 yield train, test
1317

~/.local/lib/python3.6/site-packages/sklearn/model_selection/_split.py in _iter_indices(self, X, y, groups)
1693 class_counts = np.bincount(y_indices)
1694 if np.min(class_counts) < 2:
-> 1695 raise ValueError("The least populated class in y has only 1"
1696 " member, which is too few. The minimum"
1697 " number of groups for any class cannot"

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

see issue #1

Yes, but I want to use
cv_split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
instead of
x_train, x_test, y_train, y_test = train_test_split(data_x,data_y,test_size=0.33, random_state=42)