
TypeError: _inplace_paired_L2() missing 2 required positional arguments: 'A' and 'B' #312

Open
angelotc opened this issue Mar 31, 2021 · 12 comments
@angelotc

Description

I get this error: TypeError: _inplace_paired_L2() missing 2 required positional arguments: 'A' and 'B'

Steps/Code to Reproduce

Example:

from sklearn.datasets import make_friedman1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def friedman_np_to_df(X, y):
  return pd.DataFrame(X, columns=['x0', 'x1', 'x2', 'x3', 'x4']), pd.Series(y)

# Make training set
X_train, NA = make_friedman1(n_samples=1000, n_features=5, random_state=1)  # don't care about y, so call it NA
X_train, NA = friedman_np_to_df(X_train, NA)



# categorize the training set based on x0
domain_list = []
for i in range(len(X_train)):
  if X_train.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)


X_train['domain'] = domain_list
# Keep only rows where domain == 1 (x0 < 0.6)
X_train = X_train[X_train['domain'] == 1]
y_train = X_train['domain'].copy()
X_train = X_train.drop(columns=['domain'])


# Make testing set with a different random_state
X_test, NA2 = make_friedman1(n_samples=1000, n_features=5, random_state=3)
X_test, NA2 = friedman_np_to_df(X_test, NA2)


# categorize the testing set based on x0
domain_list = []
for i in range(len(X_test)):
  if X_test.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)
X_test['domain'] = domain_list

y_test = X_test['domain'].copy()
X_test = X_test.drop(columns=['domain'])


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from metric_learn import LMNN
lmnn_knn = Pipeline(steps=[('lmnn', LMNN()), ('knn', KNeighborsClassifier())])
parameters = {'lmnn__k': [1, 2, 3], 'knn__n_neighbors': [1, 2]}
grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters, n_jobs=-1, verbose=True)
grid_lmnn_knn.fit(X_train, y_train)
grid_lmnn_knn.score(X_test, y_test)
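
(Aside, not needed for the repro: each labeling loop above is equivalent to a vectorized one-liner.)

# Equivalent vectorized labeling, same semantics as the loops above:
X_train['domain'] = (X_train['x0'] < 0.6).astype(int)
X_test['domain'] = (X_test['x0'] < 0.6).astype(int)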

Expected Results

Example: no error is thrown and the score is calculated.

Actual Results

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.5s finished
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-54-e89c6a61ea02> in <module>()
      6 parameters = {'lmnn__k':[1, 2,3], 'knn__n_neighbors':[1 , 2]}
      7 grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters, n_jobs=-1, verbose=True)
----> 8 grid_lmnn_knn.fit(X_train,y_train)
      9 grid_lmnn_knn.score(X_test, y_test)
     10 

7 frames
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    737             refit_start_time = time.time()
    738             if y is not None:
--> 739                 self.best_estimator_.fit(X, y, **fit_params)
    740             else:
    741                 self.best_estimator_.fit(X, **fit_params)

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    348             This estimator
    349         """
--> 350         Xt, fit_params = self._fit(X, y, **fit_params)
    351         with _print_elapsed_time('Pipeline',
    352                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    313                 message_clsname='Pipeline',
    314                 message=self._log_message(step_idx),
--> 315                 **fit_params_steps[name])
    316             # Replace the transformer of the step with the fitted
    317             # transformer. This is necessary when loading the transformer

/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    726     with _print_elapsed_time(message_clsname, message):
    727         if hasattr(transformer, 'fit_transform'):
--> 728             res = transformer.fit_transform(X, y, **fit_params)
    729         else:
    730             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    572         else:
    573             # fit method of arity 2 (supervised transformation)
--> 574             return self.fit(X, y, **fit_params).transform(X)
    575 
    576 

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in fit(self, X, y)
    180     G, objective, total_active = self._loss_grad(X, L, dfG, k,
    181                                                  reg, target_neighbors,
--> 182                                                  label_inds)
    183 
    184     it = 1  # we already made one iteration

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds)
    246                                      label_inds, L)
    247 
--> 248     g0 = _inplace_paired_L2(*Lx[impostors])
    249 
    250     # we reorder the target neighbors

TypeError: _inplace_paired_L2() missing 2 required positional arguments: 'A' and 'B'

Versions

Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
Python 3.7.10 (default, Feb 20 2021, 21:17:23)
[GCC 7.5.0]
NumPy 1.19.5
SciPy 1.4.1
Scikit-Learn 0.22.2.post1
Metric-Learn 0.6.2

@perimosocordiae (Contributor)

Looks like either Lx or impostors was empty when computing the gradient of the loss.

Is X_train a NumPy array or a Pandas DataFrame in your call to grid_lmnn_knn.fit(X_train, y_train)? If it's a DataFrame, could you try again with a plain NumPy array?

In any case, we should add better input checking to surface a less opaque error message.
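
To see why the message complains about 'A' and 'B': if the impostor set is empty, Lx[impostors] is an empty array, and unpacking it with * passes zero positional arguments. A minimal sketch (with a simplified stand-in for the helper, not the actual metric-learn code):

import numpy as np

def _inplace_paired_L2(A, B):
    # simplified stand-in for the metric-learn helper of the same name
    return ((A - B) ** 2).sum(axis=-1)

Lx = np.random.rand(10, 3)           # projected training points
impostors = np.array([], dtype=int)  # an empty impostor set

# Lx[impostors] has shape (0, 3), so * unpacks to zero arguments and
# Python raises: TypeError: _inplace_paired_L2() missing 2 required
# positional arguments: 'A' and 'B'
_inplace_paired_L2(*Lx[impostors])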

@angelotc (Author) commented Apr 1, 2021

Yep, I have tried that.

I replaced the last two lines with this:

grid_lmnn_knn.fit(np.array(X_train), np.array(y_train))
grid_lmnn_knn.score(np.array(X_test), np.array(y_test))

Any other thoughts?

@perimosocordiae (Contributor)

I reproduced the issue locally, and it turns out that impostors is indeed empty when computing the gradient. See the similar issue gh-17, which apparently never resulted in a fix for this same problem.

I haven't verified yet, but I suspect the new LMNN implementation coming in gh-309 will solve this for you. We should also make sure we add test coverage for the no-impostors case.
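
A regression test for that case could be as simple as fitting on data where no impostors can exist, e.g. two tight, well-separated clusters. A hypothetical test sketch against the 0.6.2 API (the fixture values are illustrative):

import numpy as np
from metric_learn import LMNN

def test_lmnn_no_impostors():
    # Two tight clusters 100 units apart: every differently-labeled
    # point is far outside the margin, so no impostors exist.
    rng = np.random.RandomState(42)
    X = np.vstack([rng.randn(20, 3) * 0.01,
                   rng.randn(20, 3) * 0.01 + 100.0])
    y = np.array([0] * 20 + [1] * 20)
    LMNN(k=3).fit(X, y)  # should fit without raising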

@angelotc (Author) commented Apr 1, 2021

Sounds good, I will wait for that. If you have a workaround in the meantime, let me know, as I have to present my findings to my research group by next Wednesday lol

@perimosocordiae (Contributor)

Here's a workaround. It just bails out entirely if no impostors can be found:
612fcc4

Not super elegant, but it should work okay.
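
The shape of the guard, roughly (a self-contained sketch under assumed array shapes, not the actual commit; loss_grad_with_guard is a hypothetical name):

import numpy as np

def loss_grad_with_guard(Lx, impostors):
    # Bail out with a sentinel instead of unpacking an empty selection.
    if impostors.size == 0:
        return None, 0.0, 0  # caller treats None as "stop early"
    A, B = Lx[impostors]     # assumes impostors is a (2, m) index array
    g0 = ((A - B) ** 2).sum(axis=-1)
    return g0, g0.sum(), len(g0)

Lx = np.random.rand(10, 3)
print(loss_grad_with_guard(Lx, np.empty((2, 0), dtype=int)))    # (None, 0.0, 0)
print(loss_grad_with_guard(Lx, np.array([[0, 1], [2, 3]]))[2])  # 2

Note the emptiness test here is impostors.size == 0 rather than a bare truth test, which matters for NumPy arrays (see the follow-up comments below).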

@angelotc (Author) commented Apr 2, 2021

Thank you for your work, @perimosocordiae.

We are trying to apply metric learning in the materials science space, but we want to validate the approach on the Friedman dataset before going all in on the diffusion datasets. It's super weird that my pipeline now doesn't predict any of the 0s correctly in my test set; I am not sure if my approach is right. In case you're interested in our use case: my PI wants me to frame a classification problem on the Friedman dataset. Label samples where x0 < 0.6 as 1 (sample is within domain) and everything else as 0, then apply metric learning and see if it performs well.

Code so far:

from sklearn.datasets import make_friedman1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def friedman_np_to_df(X, y):
  return pd.DataFrame(X, columns=['x0', 'x1', 'x2', 'x3', 'x4']), pd.Series(y)

# Make training set
X_train, NA = make_friedman1(n_samples=1000, n_features=5, random_state=1)  # don't care about y, so call it NA
X_train, NA = friedman_np_to_df(X_train, NA)



# categorize the training set based on x0
domain_list = []
for i in range(len(X_train)):
  if X_train.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)


X_train['domain'] = domain_list
# Keep only rows where domain == 1 (x0 < 0.6)
X_train = X_train[X_train['domain'] == 1]
y_train = X_train['domain'].copy()
X_train = X_train.drop(columns=['domain'])


# Make testing set with a different random_state
X_test, NA2 = make_friedman1(n_samples=1000, n_features=5, random_state=3)
X_test, NA2 = friedman_np_to_df(X_test, NA2)


# categorize the testing set based on x0
domain_list = []
for i in range(len(X_test)):
  if X_test.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)
X_test['domain'] = domain_list

y_test = X_test['domain'].copy()
X_test = X_test.drop(columns=['domain'])


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from metric_learn import LMNN
lmnn_knn = Pipeline(steps=[('lmnn', LMNN()), ('knn', KNeighborsClassifier())])
parameters = {'lmnn__init': ['pca', 'lda', 'identity', 'random'],
              'lmnn__k': [2, 3],
              'knn__n_neighbors': [2, 3],
              'knn__weights': ['uniform', 'distance'],
              'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'knn__leaf_size': list(np.arange(1, 30, 5)),
              'knn__metric': ['euclidean', 'manhattan', 'mahalanobis', 'seuclidean', 'minkowski']}
grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters, cv=3, n_jobs=-1, verbose=True, scoring='f1')
grid_lmnn_knn.fit(np.array(X_train), np.array(y_train))
# grid_lmnn_knn.score(np.array(X_test), np.array(y_test))

predictions = grid_lmnn_knn.predict(X_test)
print(grid_lmnn_knn.best_estimator_)
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Output:

Pipeline(memory=None,
         steps=[('lmnn',
                 LMNN(convergence_tol=0.001, init='pca', k=2, learn_rate=1e-07,
                      max_iter=1000, min_iter=50, n_components=None,
                      preprocessor=None, random_state=None, regularization=0.5,
                      verbose=False)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=1,
                                      metric='manhattan', metric_params=None,
                                      n_jobs=None, n_neighbors=2, p=2,
                                      weights='uniform'))],
         verbose=False)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       387
           1       0.61      1.00      0.76       613

    accuracy                           0.61      1000
   macro avg       0.31      0.50      0.38      1000
weighted avg       0.38      0.61      0.47      1000

/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

So yeah, not sure why the f1-score is 0 for my 0 cases. Maybe I am doing it wrong haha.
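
(As the next comment works out, the training filter above kept only domain == 1 rows, so the model never saw a 0-labeled example. A one-line sanity check on the y_train built above would have surfaced this:)

import numpy as np

# A single-class y_train means the classifier can never predict the
# missing class, so class 0 scores 0 across the board.
print(np.unique(y_train, return_counts=True))  # only label 1 appears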

@angelotc (Author) commented Apr 4, 2021

I think the reason my last attempt performed badly on the Friedman dataset is that the training set contained no 0-labeled examples. Now I include some 0-labeled samples in the training set. @perimosocordiae, I found another issue with your branch:

from sklearn.datasets import make_friedman1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def friedman_np_to_df(X, y):
  return pd.DataFrame(X, columns=['x0', 'x1', 'x2', 'x3', 'x4']), pd.Series(y)

# Make training set
X_train, NA = make_friedman1(n_samples=1000, n_features=5, random_state=1)  # don't care about y, so call it NA
X_train, NA = friedman_np_to_df(X_train,NA)

# categorize the training set based on x0
domain_list = []
for i in range(len(X_train)):
  if X_train.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)


X_train['domain'] = domain_list
# Keep all in-domain (domain == 1) rows plus 60 out-of-domain rows

out_of_domain = X_train[X_train['domain'] == 0][:60]
X_train = X_train[X_train['domain'] == 1]

X_train = pd.concat([out_of_domain, X_train])

y_train = X_train['domain'].copy()
X_train = X_train.drop(columns=['domain'])


# Make testing set with a different random_state
X_test, NA2 = make_friedman1(n_samples=1000, n_features=5, random_state=3)
X_test, NA2 = friedman_np_to_df(X_test, NA2)


# categorize the testing set based on x0
domain_list = []
for i in range(len(X_test)):
  if X_test.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)
X_test['domain'] = domain_list

y_test = X_test['domain'].copy()
X_test = X_test.drop(columns=['domain'])


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from metric_learn import LMNN
lmnn_knn = Pipeline(steps=[('lmnn', LMNN()), ('knn', KNeighborsClassifier())])
parameters = {'lmnn__init': ['pca', 'lda', 'identity', 'random'],
              'lmnn__k': [2, 3],
              'knn__n_neighbors': [2, 3],
              'knn__weights': ['uniform', 'distance'],
              'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'knn__leaf_size': list(np.arange(1, 30, 5)),
              'knn__metric': ['manhattan', 'mahalanobis', 'minkowski']}
grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters, cv=5, n_jobs=-1, verbose=True, scoring='f1')
grid_lmnn_knn.fit(np.array(X_train), np.array(y_train))
grid_lmnn_knn.score(np.array(X_test), np.array(y_test))

Output:

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 1402 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 3402 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 6202 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 9802 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 11520 out of 11520 | elapsed:  3.1min finished
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-945bbdb8e331> in <module>()
     64               'knn__metric': [ 'manhattan', 'mahalanobis', 'minkowski']}
     65 grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters,cv = 5, n_jobs=-1, verbose=True, scoring='f1')
---> 66 grid_lmnn_knn.fit(np.array(X_train),np.array(y_train))
     67 grid_lmnn_knn.score(np.array(X_test), np.array(y_test))
     68 

7 frames
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    737             refit_start_time = time.time()
    738             if y is not None:
--> 739                 self.best_estimator_.fit(X, y, **fit_params)
    740             else:
    741                 self.best_estimator_.fit(X, **fit_params)

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    348             This estimator
    349         """
--> 350         Xt, fit_params = self._fit(X, y, **fit_params)
    351         with _print_elapsed_time('Pipeline',
    352                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    313                 message_clsname='Pipeline',
    314                 message=self._log_message(step_idx),
--> 315                 **fit_params_steps[name])
    316             # Replace the transformer of the step with the fitted
    317             # transformer. This is necessary when loading the transformer

/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    726     with _print_elapsed_time(message_clsname, message):
    727         if hasattr(transformer, 'fit_transform'):
--> 728             res = transformer.fit_transform(X, y, **fit_params)
    729         else:
    730             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    572         else:
    573             # fit method of arity 2 (supervised transformation)
--> 574             return self.fit(X, y, **fit_params).transform(X)
    575 
    576 

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in fit(self, X, y)
    180     G, objective, total_active = self._loss_grad(X, L, dfG, k,
    181                                                  reg, target_neighbors,
--> 182                                                  label_inds)
    183     if G is None:
    184       # TODO: raise a warning

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds)
    249     impostors = self._find_impostors(furthest_neighbors.ravel(), X,
    250                                      label_inds, L)
--> 251     if not impostors:
    252       return None, 0, 0
    253 

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

@angelotc (Author) commented Apr 4, 2021

I tried swapping it to the following, but now it goes into an infinite loop 😂

if not impostors.any():
   return None, 0, 0
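
The trouble is that .any() tests the values, not the emptiness: an index array that happens to contain only zeros is falsy too. A minimal sketch of the difference (.size is the emptiness test that avoids both this and the ValueError above):

import numpy as np

all_zero = np.array([0, 0])          # two impostor indices, both row 0
print(not all_zero.any())            # True  -- wrongly looks "empty"
print(all_zero.size == 0)            # False -- correctly non-empty

empty = np.empty((2, 0), dtype=int)  # genuinely no impostors
print(not empty.any())               # True
print(empty.size == 0)               # True  -- the check we actually want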

@perimosocordiae (Contributor) commented Apr 4, 2021 via email

@angelotc (Author) commented Apr 4, 2021

[screenshot attachment]
Super weird.

@perimosocordiae (Contributor)

Were you able to try the code from gh-309? I'm curious to see how it would handle this case.

@angelotc (Author) commented Apr 5, 2021

The code from the PR doesn't work either 🤣 I will post my results on that thread.
