import numpy as np

np.set_printoptions(suppress = True, linewidth = 100, precision = 2)

raw_data_np = np.genfromtxt("loan-data.csv", delimiter = ';', skip_header = 1, autostrip = True)
raw_data_np

array([[48010226.  ,         nan,    35000.  , ...,         nan,         nan,     9452.96],
       [57693261.  ,         nan,    30000.  , ...,         nan,         nan,     4679.7 ],
       [59432726.  ,         nan,    15000.  , ...,         nan,         nan,     1969.83],
       ...,
       [50415990.  ,         nan,    10000.  , ...,         nan,         nan,     2185.64],
       [46154151.  ,         nan,         nan, ...,         nan,         nan,     3199.4 ],
       [66055249.  ,         nan,    10000.  , ...,         nan,         nan,      301.9 ]])

np.isnan(raw_data_np).sum()

88005

temporary_fill = np.nanmax(raw_data_np) + 1
temporary_mean = np.nanmean(raw_data_np, axis = 0)

C:\Users\ADMIN\AppData\Local\Temp\ipykernel_25120\3983241459.py:2: RuntimeWarning: Mean of empty slice
  temporary_mean = np.nanmean(raw_data_np, axis = 0)

temporary_mean

array([54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
            440.92,         nan,         nan,         nan,         nan,         nan,     3143.85])

temporary_stats = np.array([np.nanmin(raw_data_np, axis = 0),
                           temporary_mean,
                           np.nanmax(raw_data_np, axis = 0)])

C:\Users\ADMIN\AppData\Local\Temp\ipykernel_25120\2183409543.py:1: RuntimeWarning: All-NaN slice encountered
  temporary_stats = np.array([np.nanmin(raw_data_np, axis = 0),
C:\Users\ADMIN\AppData\Local\Temp\ipykernel_25120\2183409543.py:3: RuntimeWarning: All-NaN slice encountered
  np.nanmax(raw_data_np, axis = 0)])

temporary_stats

array([[  373332.  ,         nan,     1000.  ,         nan,     1000.  ,         nan,        6.  ,
              31.42,         nan,         nan,         nan,         nan,         nan,        0.  ],
       [54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
             440.92,         nan,         nan,         nan,         nan,         nan,     3143.85],
       [68616519.  ,         nan,    35000.  ,         nan,    35000.  ,         nan,       28.99,
            1372.97,         nan,         nan,         nan,         nan,         nan,    41913.62]])

columns_strings = np.argwhere(np.isnan(temporary_mean)).squeeze()
columns_strings

array([ 1,  3,  5,  8,  9, 10, 11, 12], dtype=int64)

columns_numeric = np.argwhere(np.isnan(temporary_mean) == False).squeeze()
columns_numeric

array([ 0,  2,  4,  6,  7, 13], dtype=int64)

loan_data_strings = np.genfromtxt("loan-data.csv",
                                  delimiter = ';',
                                  skip_header = 1,
                                  autostrip = True, 
                                  usecols = columns_strings,
                                  dtype = str)
loan_data_strings

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

loan_data_numeric = np.genfromtxt("loan-data.csv",
                                  delimiter = ';',
                                  autostrip = True,
                                  skip_header = 1,
                                  usecols = columns_numeric,
                                  filling_values = temporary_fill)
loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

header_full = np.genfromtxt("loan-data.csv",
                            delimiter = ';',
                            autostrip = True,
                            skip_footer = raw_data_np.shape[0],
                            dtype = str)
header_full

array(['id', 'issue_d', 'loan_amnt', 'loan_status', 'funded_amnt', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state',
       'total_pymnt'], dtype='<U19')

header_strings, header_numeric = header_full[columns_strings], header_full[columns_numeric]

header_strings

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

def checkpoint(file_name, checkpoint_header, checkpoint_data):
    np.savez(file_name, header = checkpoint_header, data = checkpoint_data)
    checkpoint_variable = np.load(file_name + ".npz")
    return(checkpoint_variable)

checkpoint_test = checkpoint("checkpoint-test", header_strings, loan_data_strings)

checkpoint_test['data']

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

np.array_equal(checkpoint_test['data'], loan_data_strings)

True

header_strings

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

header_strings[0] = "issue_date"

loan_data_strings

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

np.unique(loan_data_strings[:,0])

array(['', 'Apr-15', 'Aug-15', 'Dec-15', 'Feb-15', 'Jan-15', 'Jul-15', 'Jun-15', 'Mar-15',
       'May-15', 'Nov-15', 'Oct-15', 'Sep-15'], dtype='<U69')

loan_data_strings[:,0] = np.chararray.strip(loan_data_strings[:,0], "-15")

np.unique(loan_data_strings[:,0])

array(['', 'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'],
      dtype='<U69')

months = np.array(['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

for i in range(13):
        loan_data_strings[:,0] = np.where(loan_data_strings[:,0] == months[i],
                                          i,
                                          loan_data_strings[:,0])

np.unique(loan_data_strings[:,0])

array(['0', '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U69')

header_strings

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

np.unique(loan_data_strings[:,1])

array(['', 'Charged Off', 'Current', 'Default', 'Fully Paid', 'In Grace Period', 'Issued',
       'Late (16-30 days)', 'Late (31-120 days)'], dtype='<U69')

np.unique(loan_data_strings[:,1]).size

9

status_bad = np.array(['','Charged Off','Default','Late (31-120 days)'])

loan_data_strings[:,1] = np.where(np.isin(loan_data_strings[:,1], status_bad),0,1)

np.unique(loan_data_strings[:,1])

array(['0', '1'], dtype='<U69')

header_strings

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

np.unique(loan_data_strings[:,2])

array(['', '36 months', '60 months'], dtype='<U69')

loan_data_strings[:,2] = np.chararray.strip(loan_data_strings[:,2], " months")
loan_data_strings[:,2]

array(['36', '36', '36', ..., '36', '36', '36'], dtype='<U69')

header_strings[2] = "term_months"

loan_data_strings[:,2] = np.where(loan_data_strings[:,2] == '', 
                                  '60', 
                                  loan_data_strings[:,2])
loan_data_strings[:,2]

array(['36', '36', '36', ..., '36', '36', '36'], dtype='<U69')

np.unique(loan_data_strings[:,2])

array(['36', '60'], dtype='<U69')

header_strings

array(['issue_date', 'loan_status', 'term_months', 'grade', 'sub_grade', 'verification_status',
       'url', 'addr_state'], dtype='<U19')

np.unique(loan_data_strings[:,3])

array(['', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='<U69')

np.unique(loan_data_strings[:,4])

array(['', 'A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4',
       'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4',
       'F5', 'G1', 'G2', 'G3', 'G4', 'G5'], dtype='<U69')

for i in np.unique(loan_data_strings[:,3])[1:]:
    loan_data_strings[:,4] = np.where((loan_data_strings[:,4] == '') & (loan_data_strings[:,3] == i),
                                      i + '5',
                                      loan_data_strings[:,4])

np.unique(loan_data_strings[:,4], return_counts = True)

(array(['', 'A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4',
        'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4',
        'F5', 'G1', 'G2', 'G3', 'G4', 'G5'], dtype='<U69'),
 array([  9, 285, 278, 239, 323, 592, 509, 517, 530, 553, 633, 629, 567, 586, 564, 577, 391, 267,
        250, 255, 288, 235, 162, 171, 139, 160,  94,  52,  34,  43,  24,  19,  10,   3,   7,   5],
       dtype=int64))

loan_data_strings[:,4] = np.where(loan_data_strings[:,4] == '',
                                  'H1',
                                  loan_data_strings[:,4])

np.unique(loan_data_strings[:,4])

array(['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5',
       'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5',
       'G1', 'G2', 'G3', 'G4', 'G5', 'H1'], dtype='<U69')

loan_data_strings = np.delete(loan_data_strings, 3, axis = 1)

loan_data_strings[:,3]

array(['C3', 'A5', 'B5', ..., 'A5', 'D2', 'A4'], dtype='<U69')

header_strings = np.delete(header_strings, 3)

header_strings[3]

'sub_grade'

np.unique(loan_data_strings[:,3])

array(['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5',
       'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5',
       'G1', 'G2', 'G3', 'G4', 'G5', 'H1'], dtype='<U69')

keys = list(np.unique(loan_data_strings[:,3]))                         
values = list(range(1, np.unique(loan_data_strings[:,3]).shape[0] + 1)) 
dict_sub_grade = dict(zip(keys, values))

dict_sub_grade

{'A1': 1,
 'A2': 2,
 'A3': 3,
 'A4': 4,
 'A5': 5,
 'B1': 6,
 'B2': 7,
 'B3': 8,
 'B4': 9,
 'B5': 10,
 'C1': 11,
 'C2': 12,
 'C3': 13,
 'C4': 14,
 'C5': 15,
 'D1': 16,
 'D2': 17,
 'D3': 18,
 'D4': 19,
 'D5': 20,
 'E1': 21,
 'E2': 22,
 'E3': 23,
 'E4': 24,
 'E5': 25,
 'F1': 26,
 'F2': 27,
 'F3': 28,
 'F4': 29,
 'F5': 30,
 'G1': 31,
 'G2': 32,
 'G3': 33,
 'G4': 34,
 'G5': 35,
 'H1': 36}

for i in np.unique(loan_data_strings[:,3]):
        loan_data_strings[:,3] = np.where(loan_data_strings[:,3] == i, 
                                          dict_sub_grade[i],
                                          loan_data_strings[:,3])

np.unique(loan_data_strings[:,3])

array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36',
       '4', '5', '6', '7', '8', '9'], dtype='<U69')

header_strings

array(['issue_date', 'loan_status', 'term_months', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

np.unique(loan_data_strings[:,4])

array(['', 'Not Verified', 'Source Verified', 'Verified'], dtype='<U69')

loan_data_strings[:,4] = np.where((loan_data_strings[:,4] == '') | (loan_data_strings[:,4] == 'Not Verified'), 0, 1)

np.unique(loan_data_strings[:,4])

array(['0', '1'], dtype='<U69')

loan_data_strings[:,5]

array(['https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226',
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261',
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', ...,
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990',
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151',
       'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249'], dtype='<U69')

np.chararray.strip(loan_data_strings[:,5], "https://www.lendingclub.com/browse/loanDetail.action?loan_id=")

chararray(['48010226', '57693261', '59432726', ..., '50415990', '46154151', '66055249'],
          dtype='<U69')

loan_data_strings[:,5] = np.chararray.strip(loan_data_strings[:,5], "https://www.lendingclub.com/browse/loanDetail.action?loan_id=")

header_full

array(['id', 'issue_d', 'loan_amnt', 'loan_status', 'funded_amnt', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state',
       'total_pymnt'], dtype='<U19')

loan_data_numeric[:,0].astype(dtype = np.int32)

array([48010226, 57693261, 59432726, ..., 50415990, 46154151, 66055249])

loan_data_strings[:,5].astype(dtype = np.int32)

array([48010226, 57693261, 59432726, ..., 50415990, 46154151, 66055249])

np.array_equal(loan_data_numeric[:,0].astype(dtype = np.int32), loan_data_strings[:,5].astype(dtype = np.int32))

True

loan_data_strings = np.delete(loan_data_strings, 5, axis = 1)
header_strings = np.delete(header_strings, 5)

loan_data_strings[:,5]

array(['CA', 'NY', 'PA', ..., 'CA', 'OH', 'IL'], dtype='<U69')

header_strings

array(['issue_date', 'loan_status', 'term_months', 'sub_grade', 'verification_status',
       'addr_state'], dtype='<U19')

loan_data_numeric[:,0]

array([48010226., 57693261., 59432726., ..., 50415990., 46154151., 66055249.])

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

header_strings

array(['issue_date', 'loan_status', 'term_months', 'sub_grade', 'verification_status',
       'addr_state'], dtype='<U19')

header_strings[5] = "state_address"

states_names, states_count = np.unique(loan_data_strings[:,5], return_counts = True)
states_count_sorted = np.argsort(-states_count)
states_names[states_count_sorted], states_count[states_count_sorted]

(array(['CA', 'NY', 'TX', 'FL', '', 'IL', 'NJ', 'GA', 'PA', 'OH', 'MI', 'NC', 'VA', 'MD', 'AZ',
        'WA', 'MA', 'CO', 'MO', 'MN', 'IN', 'WI', 'CT', 'TN', 'NV', 'AL', 'LA', 'OR', 'SC', 'KY',
        'KS', 'OK', 'UT', 'AR', 'MS', 'NH', 'NM', 'WV', 'HI', 'RI', 'MT', 'DE', 'DC', 'WY', 'AK',
        'NE', 'SD', 'VT', 'ND', 'ME'], dtype='<U69'),
 array([1336,  777,  758,  690,  500,  389,  341,  321,  320,  312,  267,  261,  242,  222,  220,
         216,  210,  201,  160,  156,  152,  148,  143,  143,  130,  119,  116,  108,  107,   84,
          84,   83,   74,   74,   61,   58,   57,   49,   44,   40,   28,   27,   27,   27,   26,
          25,   24,   17,   16,   10], dtype=int64))

loan_data_strings[:,5] = np.where(loan_data_strings[:,5] == '', 
                                  0, 
                                  loan_data_strings[:,5])

states_west = np.array(['WA', 'OR','CA','NV','ID','MT', 'WY','UT','CO', 'AZ','NM','HI','AK'])
states_south = np.array(['TX','OK','AR','LA','MS','AL','TN','KY','FL','GA','SC','NC','VA','WV','MD','DE','DC'])
states_midwest = np.array(['ND','SD','NE','KS','MN','IA','MO','WI','IL','IN','MI','OH'])
states_east = np.array(['PA','NY','NJ','CT','MA','VT','NH','ME','RI'])

loan_data_strings[:,5] = np.where(np.isin(loan_data_strings[:,5], states_west), 1, loan_data_strings[:,5])
loan_data_strings[:,5] = np.where(np.isin(loan_data_strings[:,5], states_south), 2, loan_data_strings[:,5])
loan_data_strings[:,5] = np.where(np.isin(loan_data_strings[:,5], states_midwest), 3, loan_data_strings[:,5])
loan_data_strings[:,5] = np.where(np.isin(loan_data_strings[:,5], states_east), 4, loan_data_strings[:,5])

np.unique(loan_data_strings[:,5])

array(['0', '1', '2', '3', '4'], dtype='<U69')

loan_data_strings

array([['5', '1', '36', '13', '1', '1'],
       ['0', '1', '36', '5', '1', '4'],
       ['9', '1', '36', '10', '1', '4'],
       ...,
       ['6', '1', '36', '5', '1', '1'],
       ['4', '1', '36', '17', '1', '3'],
       ['12', '1', '36', '4', '0', '3']], dtype='<U69')

loan_data_strings = loan_data_strings.astype(int)

loan_data_strings

array([[ 5,  1, 36, 13,  1,  1],
       [ 0,  1, 36,  5,  1,  4],
       [ 9,  1, 36, 10,  1,  4],
       ...,
       [ 6,  1, 36,  5,  1,  1],
       [ 4,  1, 36, 17,  1,  3],
       [12,  1, 36,  4,  0,  3]])

checkpoint_strings = checkpoint("Checkpoint-Strings", header_strings, loan_data_strings)

checkpoint_strings["header"]

array(['issue_date', 'loan_status', 'term_months', 'sub_grade', 'verification_status',
       'state_address'], dtype='<U19')

checkpoint_strings["data"]

array([[ 5,  1, 36, 13,  1,  1],
       [ 0,  1, 36,  5,  1,  4],
       [ 9,  1, 36, 10,  1,  4],
       ...,
       [ 6,  1, 36,  5,  1,  1],
       [ 4,  1, 36, 17,  1,  3],
       [12,  1, 36,  4,  0,  3]])

np.array_equal(checkpoint_strings['data'], loan_data_strings)

True

loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

np.isnan(loan_data_numeric).sum()

0

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

temporary_fill

68616520.0

np.isin(loan_data_numeric[:,0], temporary_fill)

array([False, False, False, ..., False, False, False])

np.isin(loan_data_numeric[:,0], temporary_fill).sum()

0

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

temporary_stats[:, columns_numeric]

array([[  373332.  ,     1000.  ,     1000.  ,        6.  ,       31.42,        0.  ],
       [54015809.19,    15273.46,    15311.04,       16.62,      440.92,     3143.85],
       [68616519.  ,    35000.  ,    35000.  ,       28.99,     1372.97,    41913.62]])

loan_data_numeric[:,2]

array([35000., 30000., 15000., ..., 10000., 10000., 10000.])

loan_data_numeric[:,2] = np.where(loan_data_numeric[:,2] == temporary_fill, 
                                  temporary_stats[0, columns_numeric[2]],
                                  loan_data_numeric[:,2])
loan_data_numeric[:,2]

array([35000., 30000., 15000., ..., 10000., 10000., 10000.])

temporary_stats[0,columns_numeric[3]]

6.0

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

for i in [1,3,4,5]:
    loan_data_numeric[:,i] = np.where(loan_data_numeric[:,i] == temporary_fill,
                                      temporary_stats[2, columns_numeric[i]],
                                      loan_data_numeric[:,i])

loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  ,       28.99,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  ,       28.99,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  ,       28.99,     1372.97,     2185.64],
       [46154151.  ,    35000.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  ,       28.99,      309.97,      301.9 ]])

EUR_USD = np.genfromtxt("EUR-USD.csv", delimiter = ',', autostrip = True, skip_header = 1, usecols = 3)
EUR_USD

array([1.13, 1.12, 1.08, 1.11, 1.1 , 1.12, 1.09, 1.13, 1.13, 1.1 , 1.06, 1.09])

loan_data_strings[:,0]

array([ 5,  0,  9, ...,  6,  4, 12])

exchange_rate = loan_data_strings[:,0]

for i in range(1,13):
    exchange_rate = np.where(exchange_rate == i,
                             EUR_USD[i-1],
                             exchange_rate)    

exchange_rate = np.where(exchange_rate == 0,
                         np.mean(EUR_USD),
                         exchange_rate)

exchange_rate

array([1.1 , 1.11, 1.13, ..., 1.12, 1.11, 1.09])

exchange_rate.shape

(10000,)

loan_data_numeric.shape

(10000, 6)

exchange_rate = np.reshape(exchange_rate, (10000,1))

loan_data_numeric = np.hstack((loan_data_numeric, exchange_rate))

header_numeric = np.concatenate((header_numeric, np.array(['exchange_rate'])))
header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt', 'exchange_rate'],
      dtype='<U19')

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt', 'exchange_rate'],
      dtype='<U19')

columns_dollar = np.array([1,2,4,5])

loan_data_numeric[:,6]

array([1.1 , 1.11, 1.13, ..., 1.12, 1.11, 1.09])

for i in columns_dollar:
    loan_data_numeric = np.hstack((loan_data_numeric, np.reshape(loan_data_numeric[:,i] / loan_data_numeric[:,6], (10000,1))))

loan_data_numeric.shape

(10000, 11)

loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  , ...,    31933.3 ,     1081.04,     8624.69],
       [57693261.  ,    30000.  ,    30000.  , ...,    27132.46,      848.86,     4232.39],
       [59432726.  ,    15000.  ,    15000.  , ...,    13326.3 ,      439.64,     1750.04],
       ...,
       [50415990.  ,    10000.  ,    10000.  , ...,     8910.3 ,     1223.36,     1947.47],
       [46154151.  ,    35000.  ,    10000.  , ...,     8997.4 ,      318.78,     2878.63],
       [66055249.  ,    10000.  ,    10000.  , ...,     9145.8 ,      283.49,      276.11]])

header_additional = np.array([column_name + '_EUR' for column_name in header_numeric[columns_dollar]])

header_additional

array(['loan_amnt_EUR', 'funded_amnt_EUR', 'installment_EUR', 'total_pymnt_EUR'], dtype='<U15')

header_numeric = np.concatenate((header_numeric, header_additional))

header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt', 'exchange_rate',
       'loan_amnt_EUR', 'funded_amnt_EUR', 'installment_EUR', 'total_pymnt_EUR'], dtype='<U19')

header_numeric[columns_dollar] = np.array([column_name + '_USD' for column_name in header_numeric[columns_dollar]])

header_numeric

array(['id', 'loan_amnt_USD', 'funded_amnt_USD', 'int_rate', 'installment_USD', 'total_pymnt_USD',
       'exchange_rate', 'loan_amnt_EUR', 'funded_amnt_EUR', 'installment_EUR', 'total_pymnt_EUR'],
      dtype='<U19')

columns_index_order = [0,1,7,2,8,3,4,9,5,10,6]

header_numeric = header_numeric[columns_index_order]

loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  , ...,    31933.3 ,     1081.04,     8624.69],
       [57693261.  ,    30000.  ,    30000.  , ...,    27132.46,      848.86,     4232.39],
       [59432726.  ,    15000.  ,    15000.  , ...,    13326.3 ,      439.64,     1750.04],
       ...,
       [50415990.  ,    10000.  ,    10000.  , ...,     8910.3 ,     1223.36,     1947.47],
       [46154151.  ,    35000.  ,    10000.  , ...,     8997.4 ,      318.78,     2878.63],
       [66055249.  ,    10000.  ,    10000.  , ...,     9145.8 ,      283.49,      276.11]])

loan_data_numeric = loan_data_numeric[:,columns_index_order]

header_numeric

array(['id', 'loan_amnt_USD', 'loan_amnt_EUR', 'funded_amnt_USD', 'funded_amnt_EUR', 'int_rate',
       'installment_USD', 'installment_EUR', 'total_pymnt_USD', 'total_pymnt_EUR', 'exchange_rate'],
      dtype='<U19')

loan_data_numeric[:,5]

array([13.33, 28.99, 28.99, ..., 28.99, 16.55, 28.99])

loan_data_numeric[:,5] = loan_data_numeric[:,5]/100

loan_data_numeric[:,5]

array([0.13, 0.29, 0.29, ..., 0.29, 0.17, 0.29])

checkpoint_numeric = checkpoint("Checkpoint-Numeric", header_numeric, loan_data_numeric)

checkpoint_numeric['header'], checkpoint_numeric['data']

(array(['id', 'loan_amnt_USD', 'loan_amnt_EUR', 'funded_amnt_USD', 'funded_amnt_EUR', 'int_rate',
        'installment_USD', 'installment_EUR', 'total_pymnt_USD', 'total_pymnt_EUR', 'exchange_rate'],
       dtype='<U19'),
 array([[48010226.  ,    35000.  ,    31933.3 , ...,     9452.96,     8624.69,        1.1 ],
        [57693261.  ,    30000.  ,    27132.46, ...,     4679.7 ,     4232.39,        1.11],
        [59432726.  ,    15000.  ,    13326.3 , ...,     1969.83,     1750.04,        1.13],
        ...,
        [50415990.  ,    10000.  ,     8910.3 , ...,     2185.64,     1947.47,        1.12],
        [46154151.  ,    35000.  ,    31490.9 , ...,     3199.4 ,     2878.63,        1.11],
        [66055249.  ,    10000.  ,     9145.8 , ...,      301.9 ,      276.11,        1.09]]))

checkpoint_strings['data'].shape

(10000, 6)

checkpoint_numeric['data'].shape

(10000, 11)

loan_data = np.hstack((checkpoint_numeric['data'], checkpoint_strings['data']))

loan_data

array([[48010226.  ,    35000.  ,    31933.3 , ...,       13.  ,        1.  ,        1.  ],
       [57693261.  ,    30000.  ,    27132.46, ...,        5.  ,        1.  ,        4.  ],
       [59432726.  ,    15000.  ,    13326.3 , ...,       10.  ,        1.  ,        4.  ],
       ...,
       [50415990.  ,    10000.  ,     8910.3 , ...,        5.  ,        1.  ,        1.  ],
       [46154151.  ,    35000.  ,    31490.9 , ...,       17.  ,        1.  ,        3.  ],
       [66055249.  ,    10000.  ,     9145.8 , ...,        4.  ,        0.  ,        3.  ]])

np.isnan(loan_data).sum()

0

header_full = np.concatenate((checkpoint_numeric['header'], checkpoint_strings['header']))

loan_data = loan_data[np.argsort(loan_data[:,0])]

loan_data

array([[  373332.  ,     9950.  ,     9038.08, ...,       21.  ,        0.  ,        1.  ],
       [  575239.  ,    12000.  ,    10900.2 , ...,       25.  ,        1.  ,        2.  ],
       [  707689.  ,    10000.  ,     8924.3 , ...,       13.  ,        1.  ,        0.  ],
       ...,
       [68614880.  ,     5600.  ,     5121.65, ...,        8.  ,        1.  ,        1.  ],
       [68615915.  ,     4000.  ,     3658.32, ...,       10.  ,        1.  ,        2.  ],
       [68616519.  ,    21600.  ,    19754.93, ...,        3.  ,        0.  ,        2.  ]])

np.argsort(loan_data[:,0])

array([   0,    1,    2, ..., 9997, 9998, 9999], dtype=int64)

loan_data = np.vstack((header_full, loan_data))

np.savetxt("loan-data-preprocessed.csv", 
           loan_data, 
           fmt = '%s',
           delimiter = ',')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter

df_loan = pd.read_csv("loan-data-preprocessed.csv")
df_loan

df_grouped = df_loan.groupby("state_address", as_index=False)["total_pymnt_USD"].sum()
df_grouped

df_grouped_0 = df_grouped[df_grouped['state_address'] != 0]
df_grouped_0

state_mapping = {
    1.0: 'West',
    2.0: 'South',
    3.0: 'Midwest',
    4.0: 'East'
}
df_grouped_0.loc[:,'state_address'] = df_grouped_0['state_address'].map(state_mapping)
df_grouped_0

C:\Users\ADMIN\AppData\Local\Temp\ipykernel_25120\2851085610.py:7: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '['West' 'South' 'Midwest' 'East']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_grouped_0.loc[:,'state_address'] = df_grouped_0['state_address'].map(state_mapping)

import matplotlib.ticker as ticker
plt.figure(figsize=(10, 6))
sns.barplot(x="state_address", y="total_pymnt_USD", data=df_grouped_0, palette="viridis")
plt.title("Total Payment USD by State Address", fontsize=14)
plt.xlabel("State Address", fontsize=12)
plt.ylabel("Total Payment (Million USD)", fontsize=12)
plt.ticklabel_format(style='plain', axis='y')
plt.grid(axis="y", alpha=0.05)
# Định dạng trục y (hiển thị giá trị theo triệu)
def millions_formatter(x, pos):
    return f'{x / 1e7:.2f}'  # Chia giá trị cho 10 triệu và định dạng thành 2 chữ số thập phân

formatter = ticker.FuncFormatter(millions_formatter)
plt.gca().yaxis.set_major_formatter(formatter)
plt.show()

C:\Users\ADMIN\AppData\Local\Temp\ipykernel_25120\3046859625.py:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="state_address", y="total_pymnt_USD", data=df_grouped_0, palette="viridis")

	id	loan_amnt_USD	loan_amnt_EUR	funded_amnt_USD	funded_amnt_EUR	int_rate	installment_USD	installment_EUR	total_pymnt_USD	total_pymnt_EUR	exchange_rate	issue_date	loan_status	term_months	sub_grade	verification_status	state_address
0	373332.0	9950.0	9038.082814	1000.0	908.350032	0.1825	360.97	327.887111	1072.82	974.496081	1.100897	10.0	1.0	36.0	21.0	0.0	1.0
1	575239.0	12000.0	10900.200379	12000.0	10900.200379	0.2099	324.58	294.832253	959.75	871.788943	1.100897	10.0	1.0	60.0	25.0	1.0	2.0
2	707689.0	10000.0	8924.299805	10000.0	8924.299805	0.1366	340.13	303.542209	3726.25	3325.417215	1.120536	2.0	1.0	36.0	13.0	1.0	0.0
3	709828.0	27200.0	24707.120859	27200.0	24707.120859	0.2899	553.87	503.107832	41913.62	38072.238051	1.100897	10.0	1.0	60.0	6.0	0.0	4.0
4	849994.0	11400.0	10526.076489	11400.0	10526.076489	0.2899	376.09	347.258957	3753.60	3465.849185	1.083025	3.0	0.0	36.0	10.0	0.0	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9995	68603178.0	14000.0	12804.119629	14000.0	12804.119629	0.2899	421.61	385.596063	41913.62	38333.357469	1.093398	12.0	1.0	36.0	1.0	0.0	1.0
9996	68604253.0	20000.0	18291.599470	20000.0	18291.599470	0.2899	631.26	577.337754	0.00	0.000000	1.093398	12.0	1.0	36.0	6.0	0.0	2.0
9997	68614880.0	5600.0	5121.647852	5600.0	5121.647852	0.2899	180.18	164.789020	0.00	0.000000	1.093398	12.0	1.0	36.0	8.0	1.0	1.0
9998	68615915.0	4000.0	3658.319894	4000.0	3658.319894	0.2899	131.87	120.605661	0.00	0.000000	1.093398	12.0	1.0	36.0	10.0	1.0	2.0
9999	68616519.0	21600.0	19754.927428	21600.0	19754.927428	0.2899	666.85	609.887655	0.00	0.000000	1.093398	12.0	1.0	36.0	3.0	0.0	2.0

Importing the Packages¶

Importing the Data¶

Checking for Incomplete Data¶

Splitting the Dataset¶

Splitting the Columns¶

Re-importing the Dataset¶

The Names of the Columns¶

Creating Checkpoints:¶

Manipulating String Columns¶

Issue Date¶

Loan Status¶

Term¶

Grade and Subgrade¶

Filling Sub Grade¶

Removing Grade¶

Converting Sub Grade¶

Verification Status¶

URL¶

State Address¶

Converting to Numbers¶

Checkpoint 1: Strings¶

Manipulating Numeric Columns¶

Substitute "Filler" Values¶

ID¶

Temporary Stats¶

Funded Amount¶

Loaned Amount, Interest Rate, Total Payment, Installment¶

Currency Change¶

The Exchange Rate¶

From USD to EUR¶

Expanding the header¶

Interest Rate¶

Checkpoint 2: Numeric¶

Creating the "Complete" Dataset¶

Sorting the New Dataset¶

Storing the New Dataset¶

	state_address	total_pymnt_USD
0	0.0	1.475132e+06
1	1.0	1.315233e+07
2	2.0	1.778518e+07
3	3.0	8.270415e+06
4	4.0	1.014033e+07

	state_address	total_pymnt_USD
1	West	1.315233e+07
2	South	1.778518e+07
3	Midwest	8.270415e+06
4	East	1.014033e+07