import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb


from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report

/Users/Steve/anaconda3/envs/PythonData/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

# IPython/Jupyter line magic: draw matplotlib figures inside the notebook output.
# This line is not valid plain Python and only works when run under IPython.
%matplotlib inline
# Default (width, height) applied to every matplotlib figure created afterwards.
rcParams['figure.figsize'] = 10, 8
# Apply seaborn's 'whitegrid' theme to subsequent plots.
sb.set_style('whitegrid')

# Path (relative to the notebook's working directory) of the cleaned Kickstarter CSV.
data = "resources/kickstart201801_cleaned.csv"

# Load the CSV into a DataFrame with pandas.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8;
# not every CSV requires an explicit encoding, but be aware this can come up.
data_df = pd.read_csv(data, encoding="ISO-8859-1")

# Preview the first five rows to sanity-check the load.
data_df.head()

from datetime import datetime

# Parse the 'launched' strings (date + time of day) into pandas datetimes.
data_df['launched_timestamp'] = pd.to_datetime(data_df['launched'], infer_datetime_format=True)
data_df.head()

# Parse the 'deadline' strings (date only) into pandas datetimes.
data_df['deadline_timestamp'] = pd.to_datetime(data_df['deadline'], infer_datetime_format=True)
data_df.head()

# Campaign length in whole days (the time-of-day remainder is truncated).
# The subtraction yields a timedelta Series that keeps data_df's index, and
# `.dt.days` extracts the day component in one vectorised step. This avoids a
# Python-level loop over every row and guarantees each duration lands on the
# row it was computed from, whatever the frame's index looks like.
data_df['duration'] = (data_df['deadline_timestamp'] - data_df['launched_timestamp']).dt.days
data_df.head()

# Count missing values per column: isnull() flags each missing cell and sum()
# totals the flags column by column, so the printout shows which columns have
# nulls and how many.
print(data_df.isnull().sum())

ID                    0
name                  0
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged           0
usd_pledged_real      0
usd_goal_real         0
launched_timestamp    0
deadline_timestamp    0
duration              0
dtype: int64

# Number of projects per main category, most frequent first.
# Shows the distinct values that will be one-hot encoded further down.
data_df['main_category'].value_counts()

Film & Video    62696
Music           49530
Publishing      39379
Games           35225
Technology      32562
Design          30066
Art             28152
Food            24599
Fashion         22812
Theater         10912
Comics          10819
Photography     10778
Crafts           8809
Journalism       4754
Dance            3767
Name: main_category, dtype: int64

# Drop the columns that will not be used by the model.
# `axis=1` is passed by keyword: newer pandas releases no longer accept the
# axis as a second positional argument, while the keyword form works on every
# version.
unused_columns = [
    'ID', 'name', 'category', 'currency', 'deadline', 'launched', 'country',
    'goal', 'pledged', 'usd pledged', 'usd_pledged_real',
    'launched_timestamp', 'deadline_timestamp',
]
data_df = data_df.drop(unused_columns, axis=1)
data_df.head()

# Binary target: 1 when the project state is exactly 'successful', 0 for every
# other state (failed, canceled, ...).
data_df['success'] = np.where(data_df.state == 'successful', 1, 0)
data_df.head()

# 'state' is now captured by 'success', and 'backers' is not used as a feature.
data_df = data_df.drop(['state', 'backers'], axis=1)
data_df.head()

# One-hot encode main_category into its own frame. drop_first=True leaves out
# the first category level, which then acts as the implicit baseline.
main_cat_dmy = pd.get_dummies(
    data_df['main_category'],
    drop_first=True,
    prefix_sep="_",
)
main_cat_dmy.head()

# The text column is no longer needed once it has been encoded.
data_df.drop('main_category', axis=1, inplace=True)
data_df.head()

# Put the remaining columns and the dummy columns side by side (column-wise).
data_dmy = pd.concat([data_df, main_cat_dmy], axis='columns')
data_dmy.head()

# Features = every column except the target; target = 'success'.
# Selecting by name instead of by hard-coded positions keeps X complete: the
# previous position list stopped one column short and silently left the last
# main_category dummy out of the feature matrix. It also stays correct if
# columns are added or reordered upstream.
X = data_dmy.drop('success', axis=1)
y = data_dmy['success']

# Import train_test_split from model_selection; this rebinds the name that was
# imported from the deprecated sklearn.cross_validation module at the top of
# the notebook.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=25)

from sklearn.linear_model import LogisticRegression
# Logistic regression with all-default hyperparameters.
# NOTE(review): the defaults depend on the installed scikit-learn version; the
# repr recorded in this notebook shows solver='liblinear', C=1.0, penalty='l2'.
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Fit the logistic regression on the training split only; the test split stays
# unseen until evaluation.
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Report classifier.score() (mean accuracy for scikit-learn classifiers) on
# both splits; comparing the two numbers is a quick check for overfitting.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6430362573455995
Testing Data Score: 0.642684379946291

# Predicted class label (0 or 1) for every row of the test split.
predictions = classifier.predict(X_test)
# Side-by-side view of predicted vs. actual labels; the row labels come from
# y_test's index.
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

from sklearn.metrics import confusion_matrix

# Rows are the actual classes and columns are the predicted classes, so the
# layout is [[TN, FP], [FN, TP]].
# The result is stored under its own name so the imported confusion_matrix
# function is not overwritten and can still be called afterwards.
conf_matrix = confusion_matrix(y_test, predictions)
conf_matrix

array([[72275,     0],
       [40183,     0]])

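In the confusion matrix, rows are the actual classes and columns are the predicted classes. The first row shows that 72275 unsuccessful projects were correctly predicted as unsuccessful (true negatives) and 0 were wrongly predicted as successful (false positives). The second row shows that 40183 successful projects were wrongly predicted as unsuccessful (false negatives) and 0 were correctly predicted as successful (true positives). In other words, the model predicts "not successful" for every project in the test set.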

# Per-class precision, recall, f1-score and support for the test-set predictions.
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.64      1.00      0.78     72275
          1       0.00      0.00      0.00     40183

avg / total       0.41      0.64      0.50    112458

/Users/Steve/anaconda3/envs/PythonData/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.

The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important; the f1-score column in the report above is the F-beta score with beta = 1.0.

The support is the number of occurrences of each class in y_test.

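Taking the independent variables of main category, goal amount (in USD) and length of project (duration in days), the model reaches 64% accuracy. However, the confusion matrix shows that it predicts "not successful" for every project, so this 64% is simply the share of unsuccessful projects in the test set (72275 of 112458) rather than real predictive power. To better tune our model, maybe we can use more features such as sub categories or country location.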

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.0	failed	0	GB	0.0	0.0	1533.95
1	1000003930	Greeting From Earth: ZGAC Arts Capsule For ET	Narrative Film	Film & Video	USD	2017-11-01	30000.0	2017-09-02 04:43:57	2421.0	failed	15	US	100.0	2421.0	30000.00
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.0	failed	3	US	220.0	220.0	45000.00
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.0	failed	1	US	1.0	1.0	5000.00
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.0	canceled	14	US	1283.0	1283.0	19500.00

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real	launched_timestamp
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.0	failed	0	GB	0.0	0.0	1533.95	2015-08-11 12:12:28
1	1000003930	Greeting From Earth: ZGAC Arts Capsule For ET	Narrative Film	Film & Video	USD	2017-11-01	30000.0	2017-09-02 04:43:57	2421.0	failed	15	US	100.0	2421.0	30000.00	2017-09-02 04:43:57
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.0	failed	3	US	220.0	220.0	45000.00	2013-01-12 00:20:50
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.0	failed	1	US	1.0	1.0	5000.00	2012-03-17 03:24:11
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.0	canceled	14	US	1283.0	1283.0	19500.00	2015-07-04 08:35:03

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real	launched_timestamp	deadline_timestamp
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.0	failed	0	GB	0.0	0.0	1533.95	2015-08-11 12:12:28	2015-10-09
1	1000003930	Greeting From Earth: ZGAC Arts Capsule For ET	Narrative Film	Film & Video	USD	2017-11-01	30000.0	2017-09-02 04:43:57	2421.0	failed	15	US	100.0	2421.0	30000.00	2017-09-02 04:43:57	2017-11-01
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.0	failed	3	US	220.0	220.0	45000.00	2013-01-12 00:20:50	2013-02-26
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.0	failed	1	US	1.0	1.0	5000.00	2012-03-17 03:24:11	2012-04-16
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.0	canceled	14	US	1283.0	1283.0	19500.00	2015-07-04 08:35:03	2015-08-29

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real	launched_timestamp	deadline_timestamp	duration
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.0	failed	0	GB	0.0	0.0	1533.95	2015-08-11 12:12:28	2015-10-09	58
1	1000003930	Greeting From Earth: ZGAC Arts Capsule For ET	Narrative Film	Film & Video	USD	2017-11-01	30000.0	2017-09-02 04:43:57	2421.0	failed	15	US	100.0	2421.0	30000.00	2017-09-02 04:43:57	2017-11-01	59
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.0	failed	3	US	220.0	220.0	45000.00	2013-01-12 00:20:50	2013-02-26	44
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.0	failed	1	US	1.0	1.0	5000.00	2012-03-17 03:24:11	2012-04-16	29
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.0	canceled	14	US	1283.0	1283.0	19500.00	2015-07-04 08:35:03	2015-08-29	55

	Prediction	Actual
154259	0	0
50036	0	1
176911	0	0
284196	0	0
104965	0	0
294841	0	0
205935	0	1
123475	0	0
225886	0	1
132483	0	1
40495	0	0
242371	0	1
289283	0	0
227102	0	0
284551	0	1
99713	0	0
12708	0	0
357879	0	0
103027	0	1
291681	0	1
365795	0	0
216444	0	0
24976	0	0
303340	0	1
88880	0	0
251675	0	1
148805	0	0
177130	0	1
163468	0	0
116514	0	0
...	...	...
12022	0	0
248672	0	0
13148	0	0
138629	0	1
102604	0	0
115378	0	0
296323	0	0
304036	0	1
244718	0	0
243199	0	0
121195	0	0
280298	0	0
243109	0	0
270883	0	0
154403	0	0
94411	0	0
127833	0	0
99903	0	0
20774	0	0
322446	0	1
323852	0	1
56258	0	1
218689	0	1
64377	0	0
321446	0	1
104524	0	1
321069	0	1
314587	0	1
127473	0	0
140978	0	0

	Film & Video	Music	Publishing
0	0	0	1
1	1	0	0
2	1	0	0
3	0	1	0
4	1	0	0

	Film & Video	Music	Publishing
0	0	0	1
1	1	0	0
2	1	0	0
3	0	1	0
4	1	0	0

	Film & Video	Music	Publishing
0	0	0	1
1	1	0	0
2	1	0	0
3	0	1	0
4	1	0	0