import csv

# Written with GitHub Copilot assistance.
# Reads a CSV file with the csv module and returns its rows as a list.

# Number,Race,Race Grouping,Gender,Age,Incarcerated/Supervised,Supervision Type,
# Sentence Type,Aggregate Sentence Length,Province,Marital Status,Religion

# 0-Number,1-Race,2-Race Grouping,3-Gender,4-Age,5-Incarcerated/Supervised,6-Supervision Type,
# 7-Sentence Type,8-Aggregate Sentence Length,9-Province,10-Marital Status,11-Religion
#12 columns

#https://docs.python.org/3/library/csv.html
def input_filename_csv_return_list(filename):
    """Read the CSV file at *filename* and return its rows as a list of lists.

    The reader uses a comma delimiter and a single-quote quote character,
    matching the quoting used in the source data file.
    """
    with open(filename, newline='') as handle:
        parsed_rows = csv.reader(handle, delimiter=',', quotechar="'")
        return [row for row in parsed_rows]

def input_list_with_header_return_list_dict(list_with_header):
    """Convert header-plus-rows data into a list of dicts keyed by header name.

    The first row of *list_with_header* is the header; every following row
    becomes one dict mapping header name -> cell value. The 'Age' and
    'Aggregate Sentence Length' columns are converted to int; all other
    values stay strings.

    Bug fixed: the original appended the row dict inside the inner column
    loop, so every row appeared len(row) times (12 duplicate references per
    row for this 12-column file). The append now happens once per row.
    """
    header = list_with_header[0]
    # Columns whose values are numeric in the source data.
    numeric_columns = {'Age', 'Aggregate Sentence Length'}
    list_dict = []

    for row in list_with_header[1:]:
        row_dict = {}
        for index, item in enumerate(row):
            key = header[index]
            row_dict[key] = int(item) if key in numeric_columns else item
        # Append exactly once per row, after all columns are filled in.
        list_dict.append(row_dict)

    return list_dict



def input_list_number_of_columns(data):
    """Return the number of columns, as measured from the header (first) row."""
    header_row = data[0]
    return len(header_row)

def input_list_check_all_rows_same_number_of_columns(data):
    """Return True iff every row has the same length as the first row."""
    expected_length = len(data[0])
    return all(len(row) == expected_length for row in data)

def input_list_num_column_check_all_rows_same_number_of_columns(data, num_columns):
    """Return True iff every row in *data* has exactly *num_columns* entries."""
    return all(len(row) == num_columns for row in data)

def input_list_column_return_set(data, column):
    """Return the set of distinct values appearing in the given column index."""
    return {row[column] for row in data}

# Frequency count in vanilla Python: maps each unique cell value to the number
# of times it appears. NOTE: not true one-hot encoding, despite the name.
def one_hot_encoding(data):
    """Tally how many times each distinct cell value appears across all rows.

    Returns a dict mapping value -> occurrence count. NOTE: despite the
    name, this is a frequency table rather than a one-hot encoding; the
    name is preserved for compatibility with existing callers.
    """
    counts = {}
    for row in data:
        for cell in row:
            counts[cell] = counts.get(cell, 0) + 1
    return counts

# --- Script entry: load the CSV and run basic sanity checks ---
# NOTE(review): assumes 'OpenDataFile20130414v2.csv' is in the working
# directory; there is no handling for a missing file.
list_data =  input_filename_csv_return_list('OpenDataFile20130414v2.csv') 


# Distinct values of column 1 (Race), then verify the grid is rectangular
# and report the column count from the header row.
print ( input_list_column_return_set(list_data, 1) )
print(input_list_check_all_rows_same_number_of_columns(list_data))
print(input_list_number_of_columns(list_data))
# Convert each data row into a dict keyed by the header names.
dict_data = input_list_with_header_return_list_dict(list_data)
#go through each row of dict_data
#print(list(dict_data) )

#import decisiontree classifier
from sklearn import tree
from sklearn.tree import export_text
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# Shallow tree (max_depth=2) with a fixed seed for reproducible runs.
# NOTE(review): decision_tree is created here but never fitted or used below.
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

#for each column in csv generate one hot encoding
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore' makes transform emit all-zeros for unseen categories
# instead of raising. NOTE(review): enc is also unused below — DictVectorizer
# is what actually encodes the data.
enc = OneHotEncoder(handle_unknown='ignore')

#The class DictVectorizer can be used to convert feature arrays represented as lists of standard Python dict objects 
# to the NumPy/SciPy representation used by scikit-learn estimators.

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer() #sparse = False  default is True
print(dict_data[0])
# fit_transform learns the feature names and encodes in one pass; calling
# transform before fitting raises AttributeError (feature_names_ not set).
# NOTE(review): .toarray() densifies the sparse matrix — this is what caused
# the memory errors documented below; keeping it sparse would avoid them.
X = vec.fit_transform(dict_data).toarray()
#vec.inverse_transform(X, dict_type=<class 'dict'>)
#for k in vec.get_feature_names_out():
#    print(k)
#Province=PRINCE EDWARD I
#Province=QUEBEC
##Province=SASKATCHEWAN
#Race Grouping=Aboriginal
#Race Grouping=Non Aboriginal
#Race=
#Race=ARAB
#Race=ARAB/WEST ASIAN
#Race=ASI-E/SOUTHEAST
#Race=LATIN AMERICAN
#Race=METIS
#Religion=KRISHNA

# 0-Number,1-Race,2-Race Grouping,3-Gender,4-Age,5-Incarcerated/Supervised,6-Supervision Type,
# 7-Sentence Type,8-Aggregate Sentence Length,9-Province,10-Marital Status,11-Religion
#12 columns

# Hypothesis: given Race (col 1), Gender (col 3), Age (col 4), and Marital
# Status (col 10), we can predict Religion (col 11).
#remove first column of X
#extract specific columns from X
#numpy.core._exceptions._ArrayMemoryError: Unable to allocate 48.0 GiB for an array with shape (277224, 23233) and data type float64
#numpy.core._exceptions.MemoryError: Unable to allocate array with shape (156816, 36, 53806) and data type uint8
#numpy.core._exceptions._ArrayMemoryError: Unable to allocate XX GiB for an array with shape (XX, XX) and data type #float64
#check current setting: cat /proc/sys/vm/overcommit_memory
#the default (0) causes the allocation failures above for this workload
#enable overcommit:  echo 1 | sudo tee /proc/sys/vm/overcommit_memory
#  (note: `sudo echo 1 > file` does NOT work — the redirection runs unprivileged)
#restore default:    echo 0 | sudo tee /proc/sys/vm/overcommit_memory
#sysctl -w vm.overcommit_memory=1
#1	-	Always overcommit. Appropriate for some scientific
#		applications. Classic example is code using sparse arrays
#		and just relying on the virtual memory consisting almost
#		entirely of zero pages.
#sparse = false can help here https://www.kernel.org/doc/Documentation/vm/overcommit-accounting
# Select feature columns 1,3,4,10 and target column 11 from the encoded matrix.
# NOTE(review): these index into the DictVectorizer output X, not the original
# CSV — after one-hot expansion the columns no longer line up with 1-Race /
# 3-Gender / 4-Age / 10-Marital Status / 11-Religion. Selecting by
# vec.get_feature_names_out() would be safer; TODO confirm intent.
X_mod = X[:, [1,3,4,10]]
Y_mod = X[:, [11]]
clf = ExtraTreesClassifier(n_estimators=10, random_state=0)
# NOTE(review): Y_mod is a (n, 1) column vector; sklearn expects a 1-D array
# and will emit a DataConversionWarning (Y_mod.ravel() would silence it).
clf = clf.fit(X_mod, Y_mod)
print(clf.feature_importances_)
#[0.82982995 0.00443811 0.00828155 0.1574504 ]
#looks like Race, and marital status are the most important features in determining religion

# Single-feature tree: fit encoded column 11 against encoded column 1, then
# dump the learned rules as text. See the column-index caveat noted earlier.
X_race = X[:, [1]]
Y_religion = X[:, [11]]
#clf2 = ExtraTreesClassifier(n_estimators=10, random_state=0)
# A single DecisionTreeClassifier is used because export_text needs the
# .tree_ attribute, which ensemble classifiers do not expose (see the
# AttributeError noted below).
clf2 = DecisionTreeClassifier()
clf2 = clf2.fit(X_race, Y_religion)
#The decision tree estimator to be exported. 
# It can be an instance of DecisionTreeClassifier or DecisionTreeRegressor.
#AttributeError: 'ExtraTreesClassifier' object has no attribute 'tree_'
from sklearn.tree import export_text
r = export_text(clf2 )
print(r)

#create a decision tree classifier with dictvectorizer
