Ребята, я новичок в глубоком обучении. Я обучал DNN по набору данных о доходах взрослых в США.
Где я на самом деле ошибся? и еще один вопрос. Я хочу протестировать свою модель на другом наборе данных. Как мне это сделать?
Это мой код:
import pandas as pd
input_data = pd.read_csv('adult.data.csv')
def label_fix(label):
if label == '<=50K':
return 0
else:
return 1
input_data['Income'] = input_data['Income'].apply(label_fix)
from sklearn.model_selection import train_test_split
x_data = input_data.drop('Income',axis = 1)
y_labels = input_data['Income']
X_train,X_test,y_train,y_test = train_test_split(x_data,y_labels,test_size= 0.3,random_state=101)
import tensorflow as tf
Age = tf.feature_column.numeric_column('Age')
Job_class = tf.feature_column.categorical_column_with_hash_bucket('Job-
Class',hash_bucket_size=1000)
fnlwgt = tf.feature_column.numeric_column('fnlwgt')
Education = tf.feature_column.categorical_column_with_hash_bucket('Education',hash_bucket_size=1000)
Education_num = tf.feature_column.numeric_column('Education-num')
Status = tf.feature_column.categorical_column_with_hash_bucket('Status',hash_bucket_size=1000)
Designation = tf.feature_column.categorical_column_with_hash_bucket('Designation',hash_bucket_size=1000)
Marital = tf.feature_column.categorical_column_with_hash_bucket('Marital',hash_bucket_size=1000)
Colour = tf.feature_column.categorical_column_with_vocabulary_list('Colour',['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'])
Gender = tf.feature_column.categorical_column_with_vocabulary_list('Gender',['Male','Female'])
Capital_gain = tf.feature_column.numeric_column('capital-gain')
Capital_loss = tf.feature_column.numeric_column('capital-loss')
Hours = tf.feature_column.numeric_column('hours-per-week')
Native_country = tf.feature_column.categorical_column_with_hash_bucket('Native-Country',hash_bucket_size=1000)
Income = tf.feature_column.numeric_column('Income')
feats_cols = [Age,Job_class,fnlwgt,Education,Education_num,Status,Designation,Marital,Colour,Gender,Capital_gain,Capital_loss,Hours,Native_country]
model = tf.estimator.LinearClassifier(feature_columns=feats_cols)
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=100,num_epochs=None,shuffle=True)
model.train(input_fn=input_func,steps = 5000)
среднее / общее 1,00 1,00 1,00 9769
pred_fn = tf.estimator.inputs.pandas_input_fn(x=X_test,batch_size=len(X_test),shuffle=False)
predictions = list(model.predict(input_fn=pred_fn))
final_preds=[]
for pred in predictions:
final_preds.append(pred['class_ids'][0])
from sklearn.metrics import classification_report
print(classification_report(y_test,final_preds))
precision recall f1-score support 1 1.00 1.00 1.00 9769
В вашем методе есть ошибка
. Поскольку значениеimport pandas as pd input_data = pd.read_csv('adult.data.csv') def label_fix(label): if label == '<=50K': return 0 else: return 1 input_data['Income'] = input_data['Income'].apply(label_fix) from sklearn.model_selection import train_test_split x_data = input_data.drop('Income',axis = 1) y_labels = input_data['Income'] X_train,X_test,y_train,y_test = train_test_split(x_data,y_labels,test_size= 0.3,random_state=101) import tensorflow as tf Age = tf.feature_column.numeric_column('Age') Job_class = tf.feature_column.categorical_column_with_hash_bucket('Job- Class',hash_bucket_size=1000) fnlwgt = tf.feature_column.numeric_column('fnlwgt') Education = tf.feature_column.categorical_column_with_hash_bucket('Education',hash_bucket_size=1000) Education_num = tf.feature_column.numeric_column('Education-num') Status = tf.feature_column.categorical_column_with_hash_bucket('Status',hash_bucket_size=1000) Designation = tf.feature_column.categorical_column_with_hash_bucket('Designation',hash_bucket_size=1000) Marital = tf.feature_column.categorical_column_with_hash_bucket('Marital',hash_bucket_size=1000) Colour = tf.feature_column.categorical_column_with_vocabulary_list('Colour',['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']) Gender = tf.feature_column.categorical_column_with_vocabulary_list('Gender',['Male','Female']) Capital_gain = tf.feature_column.numeric_column('capital-gain') Capital_loss = tf.feature_column.numeric_column('capital-loss') Hours = tf.feature_column.numeric_column('hours-per-week') Native_country = tf.feature_column.categorical_column_with_hash_bucket('Native-Country',hash_bucket_size=1000) Income = tf.feature_column.numeric_column('Income') feats_cols = [Age,Job_class,fnlwgt,Education,Education_num,Status,Designation,Marital,Colour,Gender,Capital_gain,Capital_loss,Hours,Native_country] model = tf.estimator.LinearClassifier(feature_columns=feats_cols) input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=100,num_epochs=None,shuffle=True) model.train(input_fn=input_func,steps = 5000)
всегда имеет префикс пробела (pred_fn = tf.estimator.inputs.pandas_input_fn(x=X_test,batch_size=len(X_test),shuffle=False) predictions = list(model.predict(input_fn=pred_fn)) final_preds=[] for pred in predictions: final_preds.append(pred['class_ids'][0]) from sklearn.metrics import classification_report print(classification_report(y_test,final_preds))
), _4_ всегда будет возвращать 1, что обеспечивает идеальный отзыв и точность. Если вы исправите свой метод для обработки начального пробела, вы получите более разумную точность и количество отзывов.precision recall f1-score support 1 1.00 1.00 1.00 9769