# import all packages and set plots to be embedded inline
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sb
    import datetime
    
    
    %matplotlib inline


                
                  #Read dataset into a df using pandas 
    
    LoanData = pd.read_csv('prosperLoanData.csv')


                
                  LoanData.sample(5)


                
                  # dataframe dimensions
    
    LoanData.shape

(113937, 81)


                
                  selectedCols = [
        'ListingKey', 'ListingCreationDate','CreditGrade', 
        'LoanStatus', 'BorrowerRate', 'ProsperScore', 
        'EmploymentStatus', 'EmploymentStatusDuration', 
        'IsBorrowerHomeowner',
        'PublicRecordsLast10Years', 'DebtToIncomeRatio', 'IncomeRange', 
        'LoanOriginalAmount', 'MonthlyLoanPayment'
         ]


                
                  loan_new = LoanData[selectedCols]


                
                  loan_new.info()

<class 'pandas.core.frame.DataFrame'>
    RangeIndex: 113937 entries, 0 to 113936
    Data columns (total 14 columns):
     #   Column                    Non-Null Count   Dtype  
    ---  ------                    --------------   -----  
     0   ListingKey                113937 non-null  object 
     1   ListingCreationDate       113937 non-null  object 
     2   CreditGrade               28953 non-null   object 
     3   LoanStatus                113937 non-null  object 
     4   BorrowerRate              113937 non-null  float64
     5   ProsperScore              84853 non-null   float64
     6   EmploymentStatus          111682 non-null  object 
     7   EmploymentStatusDuration  106312 non-null  float64
     8   IsBorrowerHomeowner       113937 non-null  bool   
     9   PublicRecordsLast10Years  113240 non-null  float64
     10  DebtToIncomeRatio         105383 non-null  float64
     11  IncomeRange               113937 non-null  object 
     12  LoanOriginalAmount        113937 non-null  int64  
     13  MonthlyLoanPayment        113937 non-null  float64
    dtypes: bool(1), float64(6), int64(1), object(6)
    memory usage: 11.4+ MB


                
                  # Retrieve summary stats and check for outliers
    
    loan_new.describe()


                
                  #Check for duplicates
    
    loan_new.duplicated().sum()

0


                
                  # dataframe name (loan_new) is hardcoded in the function:
    # Takes column name as param 
    # returns the unique values in the indicated column
    
    def entries(columnName, df=None):
        """Return the distinct values found in one column of a dataframe.

        Parameters
        ----------
        columnName : str
            Label of the column to inspect.
        df : pandas.DataFrame, optional
            Dataframe to read from. When omitted, the notebook-level
            ``loan_new`` dataframe is used, so one-argument calls behave
            exactly as before.

        Returns
        -------
        The result of ``Series.unique()`` for that column (NaN is kept as a
        value when the column contains it).
        """
        # Reading a module-level name needs no ``global`` declaration; the
        # lookup of loan_new only happens when no dataframe is supplied.
        frame = loan_new if df is None else df
        return frame[columnName].unique()


                
                  #List unique entries in the LoanStatus column
    
    entries('LoanStatus')

array(['Completed', 'Current', 'Past Due (1-15 days)', 'Defaulted',
           'Chargedoff', 'Past Due (16-30 days)', 'Cancelled',
           'Past Due (61-90 days)', 'Past Due (31-60 days)',
           'Past Due (91-120 days)', 'FinalPaymentInProgress',
           'Past Due (>120 days)'], dtype=object)


                
                  #List unique entries in the EmploymentStatus column
    
    entries('EmploymentStatus')

array(['Self-employed', 'Employed', 'Not available', 'Full-time', 'Other',
           nan, 'Not employed', 'Part-time', 'Retired'], dtype=object)


                
                  #List unique entries in the IsBorrowerHomeowner column
    
    entries('IsBorrowerHomeowner')

array([ True, False])


                
                  #List unique entries in the IncomeRange column
    
    entries('IncomeRange')

array(['$25,000-49,999', '$50,000-74,999', 'Not displayed', '$100,000+',
           '$75,000-99,999', '$1-24,999', 'Not employed', '$0'], dtype=object)


                
                  #List unique entries in the CreditGrade column
    
    entries('CreditGrade')

array(['C', nan, 'HR', 'AA', 'D', 'B', 'E', 'A', 'NC'], dtype=object)


                
                  loan_clean = loan_new.copy()


                
                  #Change datatype to Datetime from string
    
    loan_clean['ListingCreationDate']= pd.to_datetime(loan_clean['ListingCreationDate'])


                
                  #Confirm that ListingCreationDate is of datetime datatype
    
    assert loan_clean['ListingCreationDate'].dtype == 'datetime64[ns]'


                
                  loan_clean.loc[(loan_clean.EmploymentStatus == 'Full-time' ), 'EmploymentStatus'] = 'Employed'


                
                  #Confirm that Full-time is not in the EmploymentStatus column
    
    assert 'Full-time' not in loan_clean.EmploymentStatus.unique()


                
                  loan_clean.IncomeRange.dtype

dtype('O')


                
                  loan_clean.IncomeRange.unique()

array(['$25,000-49,999', '$50,000-74,999', 'Not displayed', '$100,000+',
           '$75,000-99,999', '$1-24,999', 'Not employed', '$0'], dtype=object)


                
                  # loc: select rows based on multiple conditions
    # select rows with 'Not employed' or 'Not displayed'
    # change values to '$0'
    
    # Fold both "no figure available" labels into the '$0' bucket in one membership test.
    loan_clean.loc[loan_clean['IncomeRange'].isin(['Not employed', 'Not displayed']), 'IncomeRange'] = '$0'


                
                  # list unique items in the IncomeRange column
    
    loan_clean.IncomeRange.unique()

array(['$25,000-49,999', '$50,000-74,999', '$0', '$100,000+',
           '$75,000-99,999', '$1-24,999'], dtype=object)


                
                  # income ranges in a list 
    
    ordered_ranges = [
                      '$0',
                      '$1-24,999',
                      '$25,000-49,999',
                      '$50,000-74,999',
                      '$75,000-99,999',
                      '$100,000+'
                     ]


                
                  #categorical method takes in a list-like of values and sorted categories list
    
    loan_clean['IncomeRange'] = pd.Categorical(loan_clean['IncomeRange'], ordered_ranges, ordered=True)


                
                  # Select unique items from IncomeRange column and sort them
    
    loan_clean['IncomeRange'].unique().sort_values()

['$0', '$1-24,999', '$25,000-49,999', '$50,000-74,999', '$75,000-99,999', '$100,000+']
    Categories (6, object): ['$0' < '$1-24,999' < '$25,000-49,999' < '$50,000-74,999' < '$75,000-99,999' < '$100,000+']


                
                  loan_clean['IncomeRange'].dtype

CategoricalDtype(categories=['$0', '$1-24,999', '$25,000-49,999', '$50,000-74,999',
                      '$75,000-99,999', '$100,000+'],
    , ordered=True)


                
                  #confirm that datatype is category
    
    assert loan_clean['IncomeRange'].dtype == 'category'


                
                  #Unique entries in the CreditGrade column
    
    loan_clean['CreditGrade'].unique()

array(['C', nan, 'HR', 'AA', 'D', 'B', 'E', 'A', 'NC'], dtype=object)


                
                  # temp list to hold ordered items
    
    sorted_credits = ['AA', 'A', 'B', 'C', 'D', 'E', 'HR', 'NC']


                
                  # change datatype to category
    
    loan_clean['CreditGrade'] = pd.Categorical(loan_clean['CreditGrade'], sorted_credits, ordered=True)


                
                  #Confirm the sorting order of entries in the CreditGrade column
    
    loan_clean['CreditGrade'].unique().sort_values()

['AA', 'A', 'B', 'C', 'D', 'E', 'HR', 'NC', NaN]
    Categories (8, object): ['AA' < 'A' < 'B' < 'C' < 'D' < 'E' < 'HR' < 'NC']


                
                  #assert data type is category
    
    assert loan_clean['CreditGrade'].dtype == 'category'


                
                  # write clean dataframe to CSV file
    
    loan_clean.to_csv('loan_clean.csv', index=False)


                
                  loan_clean.info()

<class 'pandas.core.frame.DataFrame'>
    RangeIndex: 113937 entries, 0 to 113936
    Data columns (total 14 columns):
     #   Column                    Non-Null Count   Dtype         
    ---  ------                    --------------   -----         
     0   ListingKey                113937 non-null  object        
     1   ListingCreationDate       113937 non-null  datetime64[ns]
     2   CreditGrade               28953 non-null   category      
     3   LoanStatus                113937 non-null  object        
     4   BorrowerRate              113937 non-null  float64       
     5   ProsperScore              84853 non-null   float64       
     6   EmploymentStatus          111682 non-null  object        
     7   EmploymentStatusDuration  106312 non-null  float64       
     8   IsBorrowerHomeowner       113937 non-null  bool          
     9   PublicRecordsLast10Years  113240 non-null  float64       
     10  DebtToIncomeRatio         105383 non-null  float64       
     11  IncomeRange               113937 non-null  category      
     12  LoanOriginalAmount        113937 non-null  int64         
     13  MonthlyLoanPayment        113937 non-null  float64       
    dtypes: bool(1), category(2), datetime64[ns](1), float64(6), int64(1), object(3)
    memory usage: 9.9+ MB


                
                  #select a random sample of 1,000 rows 
    
    np.random.seed(1)
    loan_sampled = loan_clean.sample(1000)


                
                  #Function to assist with plotting loan_clean barcharts 
    
    def draw_bar(colname):
        """Draw a horizontal count plot for one column of ``loan_clean``.

        colname: label of the column whose values are counted.
        Returns the Axes produced by seaborn, with every bar labelled.
        """
        values = loan_clean[colname]
        bar_colour = sb.color_palette()[4]
        axes = sb.countplot(y=values, color=bar_colour)
        # Attach a value label to every bar container on the Axes.
        for container in axes.containers:
            axes.bar_label(container)
        return axes


                
                  # plot bar chart
    
    plt.figure(figsize = (13,6))
    
    draw_bar('CreditGrade')
    
    plt.xlabel('Number of Loans')
    plt.ylabel('Credit Rating')
    plt.title('Count of loans Per credit Grading');


                
                  #plot bar chart
    
    plt.figure(figsize = (13,6))
    draw_bar('EmploymentStatus')
    
    plt.xlabel('Number of Loans')
    plt.ylabel('Employment Status')
    plt.title('Employment Status of Loanees');


                
                  #Summary stats
    
    loan_clean['BorrowerRate'].describe()

count    113937.000000
    mean          0.192764
    std           0.074818
    min           0.000000
    25%           0.134000
    50%           0.184000
    75%           0.250000
    max           0.497500
    Name: BorrowerRate, dtype: float64


                
                  #Create bins
    
    # Bin edges every 0.02 starting at 0. The stop is max + one full step so
    # the last edge can never fall below the largest rate, and the name
    # rate_bins avoids shadowing the builtin bin().
    rate_step = 0.02
    rate_bins = np.arange(0, loan_clean['BorrowerRate'].max() + rate_step, rate_step)

    #plot histogram

    sb.displot(
        loan_clean['BorrowerRate'],
        bins=rate_bins,
        color=sb.color_palette()[4], aspect=2.2
    ).set(title='Distribution of Borrower Rates');


                
                  # Plot bar chart
    
    plt.figure(figsize = (13,6))
    draw_bar('IncomeRange')

    # draw_bar plots the categories on the y axis, so the counts run along x.
    plt.xlabel('Number of Loans')
    plt.ylabel('Income Range')
    plt.title('No of Loans Per Income Range ');


                
                  #investigate summary stats of the DebtToIncomeRatio column
    
    loan_clean.DebtToIncomeRatio.describe()

count    105383.000000
    mean          0.275947
    std           0.551759
    min           0.000000
    25%           0.140000
    50%           0.220000
    75%           0.320000
    max          10.010000
    Name: DebtToIncomeRatio, dtype: float64


                
                  plt.figure(figsize = (12,7))
    
    #Create bins
    bins = np.arange(0, loan_clean.DebtToIncomeRatio.max()+0.01, 0.01)
    
    #plot hist
    plt.hist(data = loan_clean, x= 'DebtToIncomeRatio', bins=bins, color=sb.color_palette()[4]) 
    plt.ylabel('Count of values')
    plt.xlabel('DebtToIncomeRatio')
    plt.xlim(xmin=0, xmax=1);


                
                  plt.figure(figsize = (12,7))
    
    #plot hist
    plt.hist(data = loan_clean, x= 'DebtToIncomeRatio', color=sb.color_palette()[4]) 
    plt.ylabel('Count of values')
    plt.xlabel('DebtToIncomeRatio');


                
                  #Create a month column
    
    loan_clean['month'] = pd.to_datetime(loan_clean['ListingCreationDate']).dt.strftime('%b')


                
                  #month names
    
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun','Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


                
                  #change datatype to categorical
    
    loan_clean['month'] = pd.Categorical(loan_clean['month'], categories=months, ordered=True)


                
                  loan_clean['month'].unique()

['Aug', 'Feb', 'Jan', 'Oct', 'Sep', ..., 'May', 'Jul', 'Nov', 'Jun', 'Mar']
    Length: 12
    Categories (12, object): ['Jan' < 'Feb' < 'Mar' < 'Apr' ... 'Sep' < 'Oct' < 'Nov' < 'Dec']


                
                  loan_clean['ListingKey'].groupby(loan_clean['month']).count()

month
    Jan    11214
    Feb    10124
    Mar     8032
    Apr     7661
    May     8641
    Jun     8672
    Jul     9506
    Aug     9202
    Sep    10074
    Oct    10539
    Nov     9952
    Dec    10320
    Name: ListingKey, dtype: int64


                
                  #Slight modification to function to plot against x axis
    
    def vertical_bar(colname):
        """Draw a vertical count plot for one column of ``loan_clean``.

        colname: label of the column whose values are counted.
        Returns the Axes produced by seaborn, with every bar labelled.
        """
        column_values = loan_clean[colname]
        fill = sb.color_palette()[4]
        chart = sb.countplot(x=column_values, color=fill)
        # Attach a value label to every bar container on the Axes.
        for group in chart.containers:
            chart.bar_label(group)
        return chart


                
                  plt.figure(figsize = (12,8))
    
    vertical_bar('month')
    plt.ylabel('Count of Loans')
    # The output-suppressing semicolon belongs on the last statement itself.
    # A line holding only ";" is not valid Python; IPython appears to read it
    # as its auto-quote escape, which presumably produced the stray '' output.
    plt.title('Count of ListingCreationdate per month');

''


                
                  #plot a clustered barchart
    plt.figure(figsize = (14,6))
    
    
    clustered = sb.countplot(data = loan_clean,  x='EmploymentStatus', hue='IsBorrowerHomeowner', palette='BuPu_r')
    plt.legend()
    plt.title('Home Ownership amongst various Employment Statuses')
    for bar in clustered.containers:
            clustered.bar_label(bar)


                
                  #box plot
    
    plt.figure(figsize=(7,5))
    sb.boxplot(data=loan_clean, x='IncomeRange', y='ProsperScore',color=sb.color_palette()[4] )
    plt.xlabel('Income Ranges')
    plt.ylabel('ProsperScore')
    plt.title('Income Range Box Plot')
    plt.xticks(rotation=60);


                
                  plt.figure(figsize=(9,6))
    # Violin plot of ProsperScore within each income range.
    sb.violinplot(data=loan_clean, x='IncomeRange', y='ProsperScore', color=sb.color_palette()[4])
    plt.xlabel('Income Ranges')
    plt.ylabel('ProsperScore')
    # Title names the chart type actually drawn (was copied from the box plot).
    plt.title('Income Range Violin Plot')
    plt.xticks(rotation=60);


                
                  #Scatter plot with regression line: DebtToIncomeRatio against BorrowerRate (y axis not limited yet)
    
    plt.figure(figsize=(10,10))
    sb.regplot(data=loan_sampled, x='BorrowerRate', y='DebtToIncomeRatio',
               x_jitter=0.09, color=sb.color_palette()[4]
              
              );


                
                  #investigate summary stats of the DebtToIncomeRatio column
    
    loan_clean['DebtToIncomeRatio'].describe()

count    105383.000000
    mean          0.275947
    std           0.551759
    min           0.000000
    25%           0.140000
    50%           0.220000
    75%           0.320000
    max          10.010000
    Name: DebtToIncomeRatio, dtype: float64


                
                  # Limit the Y axis plot to between 0 and 1 where most of the data points lie
    
    plt.figure(figsize=(12,8))
    plt.title('DebtToIncomeRatio vs. BorrowerRate')
    sb.regplot(data=loan_sampled[loan_sampled.DebtToIncomeRatio <= 1], 
               x='BorrowerRate', y='DebtToIncomeRatio',
               x_jitter=0.07, color=sb.color_palette()[4]
              );


                
                  #plot a scatter plot
    
    plt.figure(figsize=(14,12))
    
    #From loan_clean dataset
    plt.subplot(2, 1, 1)
    plt.title(' LoanOriginalAmount and BorrowerRate')
    sb.regplot(data=loan_clean, x='LoanOriginalAmount', y='BorrowerRate',x_jitter=0.004, color=sb.color_palette()[4]);
    
    
    #From sampled smaller dataset 
    plt.subplot(2, 1, 2)
    sb.regplot(data=loan_sampled, x='LoanOriginalAmount', y='BorrowerRate',x_jitter=0.008, color=sb.color_palette()[4])
    plt.title('Sampled data:LoanOriginalAmount and BorrowerRate Scatter Plot') ;


                
                  plt.figure(figsize=(14,8))
    sb.countplot(data=loan_clean, x='IncomeRange', hue='CreditGrade', palette='BuPu_r')
    plt.title('Credit Grade distribution in Income Ranges')
    plt.ylabel('Count of Credit Grades');


                
                  #From sampled smaller dataset 
    
    # FacetGrid creates its own figure, sized by height/aspect, so no
    # plt.figure() call precedes it: one would only leave an empty figure.
    f = sb.FacetGrid(data=loan_sampled, hue='CreditGrade', height=6, aspect=1.5)
    f.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerRate', x_jitter=0.004)
    plt.title('CreditGrade distribution in a LoanOriginalAmount vs BorrowerRate Relationship')
    f.add_legend();

<Figure size 1008x504 with 0 Axes>


                
                  # Re-plot without reg lines
    
    f = sb.FacetGrid(data=loan_sampled, hue='CreditGrade', height=6, aspect=1.5)
    f.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerRate',fit_reg=False, x_jitter=0.008)
    plt.title('CreditGrade distribution in a LoanOriginalAmount vs BorrowerRate Relationship')
    f.add_legend()
    plt.show();


                
                  #From sampled smaller dataset 
    
    f = sb.FacetGrid(data=loan_sampled[loan_sampled.DebtToIncomeRatio <= 2], hue='IsBorrowerHomeowner', height=6, aspect=2.0)
    f.map(sb.regplot, 'DebtToIncomeRatio', 'BorrowerRate',  fit_reg=True, x_jitter=0.09)
    f.add_legend();
    plt.title('IsHomeOwner distribution in a DebtToIncomeRatio vs BorrowerRate Scatter Plot') ;

	ListingKey	ListingNumber	ListingCreationDate	CreditGrade	Term	LoanStatus	ClosedDate	BorrowerAPR	BorrowerRate	LenderYield	...	LP_ServiceFees	PercentFunded	Investors
36509	03EA35335700554889361B0	545929	2011-12-21 05:04:36.947000000	NaN	36	Current	NaN	0.24983	0.2121	0.2021	...	-46.24	1.0	13
93456	B12334290235396512FA71F	378299	2008-08-05 13:52:00.063000000	A	36	Completed	2011-08-18 00:00:00	0.19351	0.1789	0.1689	...	-168.72	1.0	181
54863	C4593390828796080B2AA44	142980	2007-05-25 17:07:18.390000000	C	36	Completed	2010-06-16 00:00:00	0.17722	0.1700	0.1600	...	-58.77	1.0	119
27532	3514358847991633987973E	892327	2013-09-09 14:06:18.307000000	NaN	36	Current	NaN	0.12691	0.0990	0.0890	...	-7.97	1.0	1
88598	AC913581222763893AF2627	800596	2013-06-06 18:13:43.877000000	NaN	36	Current	NaN	0.27285	0.2346	0.2246	...	-24.95	1.0	1

	BorrowerRate	ProsperScore	EmploymentStatusDuration	PublicRecordsLast10Years	DebtToIncomeRatio	LoanOriginalAmount	MonthlyLoanPayment
count	113937.000000	84853.000000	106312.000000	113240.000000	105383.000000	113937.00000	113937.000000
mean	0.192764	5.950067	96.071582	0.312646	0.275947	8337.01385	272.475783
std	0.074818	2.376501	94.480605	0.727868	0.551759	6245.80058	192.697812
min	0.000000	1.000000	0.000000	0.000000	0.000000	1000.00000	0.000000
25%	0.134000	4.000000	26.000000	0.000000	0.140000	4000.00000	131.620000
50%	0.184000	6.000000	67.000000	0.000000	0.220000	6500.00000	217.740000
75%	0.250000	8.000000	137.000000	0.000000	0.320000	12000.00000	371.580000
max	0.497500	11.000000	755.000000	38.000000	10.010000	35000.00000	2251.510000

Part I - Investigating the Relationships between Variables in the Prosper Loan Data¶

by Zadock Mainda¶

Introduction¶

Preliminary Wrangling¶

Structure of the dataset¶

The main features of interest in this dataset are listed below:¶

Assessing Data¶

Quality issues¶

Make copies of the original data¶

Cleaning¶

Issue #1: ListingCreationDate is a string instead of datetime object¶

Define:¶

Code¶

Test¶

Issue #2: Duplicate descriptor in the Employment status ('Employed' & 'Full-time' )¶

Define¶

Code¶

Test¶

Issue #3: IncomeRange is a string¶

Define¶

Code¶

Test¶

Issue #4: CreditGrade is a String object¶

Define¶

Code¶

Test¶

Save clean dataset to CSV¶

Univariate Exploration¶

Question #1: How are credit ratings distributed through the dataset?¶

Question #2: What was the most common Employment Status of loan borrowers?¶

Question #3: What is the distribution of interest rates in the dataset?¶

Question #4: Which income range has the fewest loans?¶

Question #5: What is the distribution of the DebtToIncomeRatio in the dataset?¶

Question #6. Loan Listing Creation trends across the year¶

Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?¶

Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?¶

Bivariate Exploration¶

Question #7: How do home ownership rates vary across employment categories?¶

Question #8: What is the relationship between ProsperScore and Income range categories?¶

Question #9: What is the relationship between ProsperScore and Income range categories?¶

Question #10: What is the correlation between DebtToIncomeRatio and BorrowerRate variables?¶

Question #12: Is there a correlation between LoanOriginalAmount and BorrowerRate?¶

Question #13: How are Credit Grades distributed through individual income ranges?¶

Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?¶

Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?¶

Multivariate Exploration¶

Question #14: How do Credit Grades influence the relationship between LoanOriginalAmount and BorrowerRate?¶

Question #15: What's the distribution of home owner statuses in a BorrowerRate vs DebtToIncomeRatio scatter plot?¶

Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?¶

Were there any interesting or surprising interactions between features?¶

Conclusions¶