import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
%matplotlib inline
# Plot defaults: 10 x 8 figures drawn with seaborn's 'whitegrid' style.
rcParams['figure.figsize'] = (10, 8)
sb.set_style(style='whitegrid')
# Relative path of the cleaned CSV that feeds the rest of the script.
data = "resources/kickstart201801_cleaned.csv"
# Load the CSV into a DataFrame. The file is decoded as ISO-8859-1 rather
# than with pandas' default encoding; not every CSV needs this, but a wrong
# encoding is a common cause of read errors.
data_df = pd.read_csv(filepath_or_buffer=data, encoding="ISO-8859-1")
# Preview the first five rows.
data_df.head(n=5)
from datetime import datetime
# Convert the `launched` strings to datetime values.
# `infer_datetime_format` is intentionally not passed: the argument is
# deprecated in pandas 2.x (format inference is the default there) and only
# triggers a warning.
data_df['launched_timestamp'] = pd.to_datetime(data_df['launched'])
data_df.head()
# Convert the `deadline` strings to datetime values.
data_df['deadline_timestamp'] = pd.to_datetime(data_df['deadline'])
data_df.head()
# Duration of each project in whole days (deadline minus launch).
# The `.dt.days` accessor is vectorised and keeps data_df's own index, so each
# value stays attached to its row; building a fresh Series from a Python
# generator would be slower and would re-index the values from 0.
data_df['duration'] = (data_df['deadline_timestamp'] - data_df['launched_timestamp']).dt.days
data_df.head()
# Report, column by column, how many entries are null.
print(data_df.isnull().sum(axis=0))
# Tally the number of rows per main category.
data_df.main_category.value_counts()
# Drop the columns that are not used by the model, including the raw date
# strings and the intermediate timestamp columns (only `duration` is kept).
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally,
# and the keyword form matches the other drop call in this script.
data_df = data_df.drop(
    ['ID', 'name', 'category', 'currency', 'deadline', 'launched', 'country',
     'goal', 'pledged', 'usd pledged', 'usd_pledged_real',
     'launched_timestamp', 'deadline_timestamp'],
    axis=1,
)
data_df.head()
# Binary target: 1 where `state` equals 'successful', 0 for every other state.
data_df['success'] = np.where(data_df.state == 'successful', 1, 0)
data_df.head()
# `state` is now encoded in `success`, so it is removed together with
# `backers`.
# NOTE(review): `backers` is presumably dropped because it is only known once
# a campaign has run - confirm.
data_df = data_df.drop(['state', 'backers'], axis=1)
data_df.head()
# One-hot encode `main_category` into its own frame. The first category level
# is dropped (drop_first=True), so it acts as the baseline that the remaining
# indicator columns are measured against.
main_cat_dmy = pd.get_dummies(
    data=data_df['main_category'],
    prefix_sep="_",
    drop_first=True,
)
main_cat_dmy.head()
# The original text column is no longer needed; remove it in place.
data_df.drop(labels='main_category', axis=1, inplace=True)
data_df.head()
# Put the remaining columns and the indicator columns side by side.
data_dmy = pd.concat(objs=[data_df, main_cat_dmy], axis=1)
data_dmy.head()
# Features: columns at positions 0-1 and 3-15 of data_dmy; the column at
# position 2 is held out as the target.
# NOTE(review): presumably position 2 is `success` - confirm against the CSV
# schema. Also verify that position 15 is the last column of data_dmy; any
# indicator column beyond it is not used as a feature.
X = data_dmy.iloc[:, [0, 1, *range(3, 16)]]
y = data_dmy.iloc[:, 2]
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyperparameters.
classifier = LogisticRegression()
classifier
# Fit on the training partition only.
classifier.fit(X=X_train, y=y_train)
# `score` reports mean accuracy; compare it on the data the model has seen
# against the held-out data.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
# Predicted class labels for the held-out rows.
predictions = classifier.predict(X_test)
# Predictions next to the true labels for a quick visual comparison.
pd.DataFrame(data={"Prediction": predictions, "Actual": y_test})
from sklearn.metrics import confusion_matrix
# Rows of the result are the true classes, columns are the predicted classes.
# The result is stored under its own name: rebinding `confusion_matrix` to the
# array would shadow the imported function, and running this statement a
# second time would then fail with
# "TypeError: 'numpy.ndarray' object is not callable".
conf_matrix = confusion_matrix(y_test, predictions)
conf_matrix
The confusion matrix tells us that the counts of correct predictions (its diagonal entries) are 72275 and 0, and the counts of incorrect predictions (its off-diagonal entries) are 40183 and 0.
# Per-class precision, recall, f1-score and support on the held-out data.
print(classification_report(y_true=y_test, y_pred=predictions))
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.
The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall; it reaches its best value at 1 and its worst value at 0.
The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important; this is the f1-score shown in the report.
The support is the number of occurrences of each class in y_test.