You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import pandas as pd
import xlearn as xl
# Load the loan data set and keep only the columns used by the FFM model.
train = pd.read_csv('ffm_train.csv')
import warnings
warnings.filterwarnings('ignore')
cols = ['Education', 'ApplicantIncome', 'Loan_Status', 'Credit_History']
# Take an explicit copy: mutating a column of a slice with ``inplace=True``
# is chained assignment, which is a silent no-op under pandas Copy-on-Write
# and would leave the NaNs / 'Y'/'N' labels untouched.
train_sub = train[cols].copy()
# Missing credit history is encoded as 0.
train_sub['Credit_History'] = train_sub['Credit_History'].fillna(0)
# Map the textual target to the 0/1 labels used for binary classification.
dict_ls = {'Y': 1, 'N': 0}
train_sub['Loan_Status'] = train_sub['Loan_Status'].replace(dict_ls)
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test = train_test_split(train_sub, test_size=0.3, random_state=5)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.datasets import make_classification
class FFMFormatPandas:
    """Encode a pandas DataFrame as libffm-style text rows.

    Each output row has the form ``label field:feature:value ...``.
    Object (categorical) columns contribute one ``field:feature:1`` entry
    whose feature index depends on the cell value; numeric columns
    contribute ``field:feature:value`` using one feature index per column.

    Attributes are populated by :meth:`fit`:
      * ``field_index_``   -- maps column name -> field index.
      * ``feature_index_`` -- maps ``"<col>_<value>"`` and ``"<col>"`` ->
        feature index.
      * ``y``              -- name of the label column, or ``None``.
    """

    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        """Learn field and feature indices from ``df``.

        Args:
            df: DataFrame to index.
            y: Name of the label column in ``df``; ``None`` if there is none.

        Returns:
            ``self``, so calls can be chained.

        Calling ``fit`` again on an already fitted encoder only *adds*
        indices for unseen columns/values; indices handed out earlier are
        never changed, so data encoded before the refit stays valid.
        """
        self.y = y
        # Index.difference returns the remaining columns in sorted order.
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
        else:
            # Refit: register only fields that were not known before.
            for col in df_ffm:
                self.field_index_.setdefault(col, len(self.field_index_))

        if self.feature_index_ is None:
            self.feature_index_ = {}
            last_idx = 0
        else:
            # Continue *after* the largest index already in use; starting
            # at the maximum itself would give a new feature the same
            # index as an existing one.
            last_idx = max(self.feature_index_.values(), default=-1) + 1

        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # One index per column, used for numeric columns. Keep the
            # existing one on refit so earlier encodings remain consistent.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its encoded rows (see :meth:`transform`)."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode a single row.

        Args:
            row: Series holding one DataFrame row (index = column names).
            t: Mapping of column name -> dtype of the source DataFrame.

        Returns:
            The row as a space-separated ``label field:feature:value`` string.
            The label is ``"0"`` when no label column was configured.

        Raises:
            KeyError: if a non-null categorical value or a column was not
                seen during :meth:`fit`.
        """
        if self.y is not None:
            # Label-based access; positional ``[0]`` on a label-indexed
            # Series is deprecated in pandas.
            ffm = [str(row.loc[self.y])]
        else:
            ffm = [str(0)]

        for col, val in row.items():
            if col == self.y:
                continue
            # fit() never indexes null values, so there is nothing to emit.
            if pd.isnull(val):
                continue
            kind = t[col].kind
            if kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(
                    self.field_index_[col], self.feature_index_[name]))
            elif kind in ('i', 'u', 'f'):
                # Signed, unsigned and float columns are all numeric
                # features; columns of any other dtype kind are skipped.
                ffm.append('{}:{}:{}'.format(
                    self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Encode every row of ``df``.

        Returns:
            A Series of encoded strings that shares ``df``'s index.
        """
        t = df.dtypes.to_dict()
        return pd.Series(
            {idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
########################### Lets build some data and test ############################
###
import os

# Fit the encoder on the training split only.
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(X_train, y='Loan_Status')
# The test split must be encoded with the mapping learned on the training
# split; refitting (or using a fresh encoder) would assign different feature
# indices and make the two files inconsistent.
ffm_test = ffm_train
ffm_test_data = ffm_test.transform(X_test)
# to_csv does not create missing directories.
os.makedirs("./model_out", exist_ok=True)
# index=False / header=False: the libffm text format must contain only the
# encoded rows. Writing the DataFrame index as a leading column corrupts the
# label field and leads to near-constant predictions.
ffm_train_data.to_csv("./model_out/train_ffm.txt", index=False, header=False)
ffm_test_data.to_csv("./model_out/test_ffm.txt", index=False, header=False)
The training loss was very low, so I thought the model worked well,
but the problem was that the predicted values were all almost the same, e.g. 0.99999, 0.99998, and so on.
I tried to figure out why this happened and finally found that my processed .txt data included the index column from the original pandas DataFrame.
I added the index=False argument to the final to_csv calls in the code snippet, and now it works.
I hope this can help someone.
The text was updated successfully, but these errors were encountered:
Hi! I'd like to share a problem I ran into while using the xlearn binary classifier model.
I modified the code from this tutorial:
https://www.analyticsvidhya.com/blog/2018/01/factorization-machines/
The training loss was very low, so I thought the model worked well,
but the problem was that the predicted values were all almost the same, e.g. 0.99999, 0.99998, and so on.
I tried to figure out why this happened and finally found that my processed .txt data included the index column from the original pandas DataFrame.
I added the index=False argument to the final to_csv calls in the code snippet, and now it works.
I hope this can help someone.
The text was updated successfully, but these errors were encountered: