import csv

#github autopilot
# Read a CSV file with the csv module and return its rows as a list.

# Number,Race,Race Grouping,Gender,Age,Incarcerated/Supervised,Supervision Type,
# Sentence Type,Aggregate Sentence Length,Province,Marital Status,Religion

# 0-Number,1-Race,2-Race Grouping,3-Gender,4-Age,5-Incarcerated/Supervised,6-Supervision Type,
# 7-Sentence Type,8-Aggregate Sentence Length,9-Province,10-Marital Status,11-Religion
#12 columns

#https://docs.python.org/3/library/csv.html
def input_filename_csv_return_list(filename, delimiter=',', quotechar='\''):
    """Read a CSV file and return every record as a list of strings.

    Args:
        filename: Path of the CSV file to open.
        delimiter: One-character field separator. Defaults to a comma.
        quotechar: One-character quote symbol. Defaults to a single quote,
            which is what this function has always used; note that this
            differs from the csv module's own default of a double quote.

    Returns:
        A list with one entry per record in the file, in file order. Each
        entry is the list of field strings produced by ``csv.reader``. No
        row is treated specially, so a header row (if any) is included.

    Raises:
        OSError: If the file cannot be opened.
        csv.Error: If the csv module rejects the file contents.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
        # Materialize inside the with-block: the reader is lazy and the
        # file is closed as soon as the block exits.
        return list(rows)

def input_list_number_of_columns(data):
    """Return the number of columns in the first row of ``data``.

    Args:
        data: Sequence of rows (each row a sequence of fields). The first
            row is used as the reference, typically the header.

    Returns:
        The length of the first row, or 0 when ``data`` is empty (for
        example, the result of reading an empty file).
    """
    # Guard against empty input instead of failing with IndexError.
    if not data:
        return 0
    return len(data[0])

def input_list_check_all_rows_same_number_of_columns(data):
    """Check that every row has as many fields as the first row.

    Args:
        data: Sequence of rows (each row a sequence of fields).

    Returns:
        True when all rows have the same length as the first row, False as
        soon as one differs. Empty ``data`` has no mismatching row, so it
        yields True.
    """
    # An empty table is trivially consistent; this also avoids indexing
    # data[0] on an empty sequence.
    if not data:
        return True
    expected_width = len(data[0])
    return all(len(row) == expected_width for row in data)

def input_list_num_column_check_all_rows_same_number_of_columns(data,num_columns):
    """Check that every row in ``data`` has exactly ``num_columns`` fields.

    Args:
        data: Iterable of rows (each row must support ``len``).
        num_columns: The required number of fields per row.

    Returns:
        True when no row has a different length (including when there are
        no rows at all), otherwise False.
    """
    return all(len(record) == num_columns for record in data)

def input_list_column_return_set(data, column):
    """Collect the distinct values found at index ``column`` across all rows.

    Args:
        data: Iterable of rows (each row indexable by ``column``).
        column: Index of the field to read from every row.

    Returns:
        A set of the values seen in that column. Every row is visited, so
        the first row's value is part of the result as well.
    """
    return {record[column] for record in data}

# Plain-Python tally of how often each distinct cell value occurs.
# NOTE: despite its name, this function builds a frequency table; it does not
# produce one-hot vectors or assign an integer code per unique value.
def one_hot_encoding(data):
    """Count the occurrences of every item across all rows of ``data``.

    Args:
        data: Iterable of rows, each an iterable of hashable items.

    Returns:
        A dict mapping each distinct item to the number of times it appears
        anywhere in ``data``, with keys in first-seen order.
    """
    tally = {}
    for record in data:
        for cell in record:
            # A missing key starts from 0, so the first sighting stores 1.
            tally[cell] = tally.get(cell, 0) + 1
    return tally

# Load the whole data file at import time; list_data holds every CSV record,
# including the first row, which the helpers above treat as the header.
list_data =  input_filename_csv_return_list('OpenDataFile20130414v2.csv') 


# Quick sanity report on the loaded data:
# 1) the distinct values in column 1 ("Race" per the column listing at the
#    top of the file); the first row's cell is included in the set,
# 2) whether every row has the same number of fields as the first row,
# 3) the number of fields in the first row.
print ( input_list_column_return_set(list_data, 1) )
print(input_list_check_all_rows_same_number_of_columns(list_data))
print(input_list_number_of_columns(list_data))

# scikit-learn tree and ensemble imports. Only DecisionTreeClassifier is used
# in the visible part of this file; the other names may be used further down.
from sklearn import tree
from sklearn.tree import export_text
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 2, with a fixed random_state so repeated
# runs give the same result. It is only constructed here, not fitted.
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# Encoder intended for one-hot encoding the CSV columns. It is only
# constructed here; no fit/transform call appears in the visible code.
# Per the scikit-learn docs, handle_unknown='ignore' encodes categories not
# seen during fit as all zeros at transform time instead of raising.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')


