Decision Trees in Python

Decision Trees in Python Homework Sample

You need to implement a decision tree that performs predictions. The dataset consists of congressional voting records, and the task is to determine whether each representative is a Democrat or a Republican. There should be a program, inspect.py, that computes the label entropy at the root; the attribute chosen for the root split is the single vote that is most useful in distinguishing Democrats from Republicans. You should also have a method that displays the tree after it has been generated recursively. The tree should be grown to a maximum depth specified by the user. For more Python programming assignments, contact us for details.
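Both programs read comma-separated files with a header row, one column per vote, and the party label in the last column. A hypothetical two-vote excerpt, just to illustrate the layout the code below expects (the column names here are made up):

vote_1,vote_2,party
y,n,democrat
n,n,republican
y,y,democrat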

Solution:

decisionTree.py

import numpy as np
import sys

def entropy(s):
    res = 0
    val, counts = np.unique(s, return_counts=True)
    freqs = counts.astype('float') / len(s)
    for p in freqs:
        if p != 0.0:
            res -= p * np.log2(p)
    return res
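# Quick sanity check with made-up labels: a perfectly balanced binary column
# has entropy 1.0 and a pure column has entropy 0.0, e.g.
#   entropy(np.array(['democrat', 'republican']))  # -> 1.0
#   entropy(np.array(['democrat', 'democrat']))    # -> 0.0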

def mutual_information(y, x):
    res = entropy(y)

    # We partition x according to its attribute values x_i
    val, counts = np.unique(x, return_counts=True)
    freqs = counts.astype('float') / len(x)

    # We subtract a weighted average of the conditional entropies
    for p, v in zip(freqs, val):
        res -= p * entropy(y[x == v])

    return res
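# Toy example (made-up values): when x perfectly separates y, the gain equals
# the full entropy of y, e.g.
#   y = np.array(['democrat', 'democrat', 'republican', 'republican'])
#   x = np.array(['y', 'y', 'n', 'n'])
#   mutual_information(y, x)  # -> 1.0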

def get_split(dataset, index):
    left, right = list(), list()
    val = np.unique([row[index] for row in dataset])

    for row in dataset:
        if row[index] == val[0]:
            left.append(list(row))
        else:
            right.append(list(row))

    return left, right, val
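# Note: this assumes binary attributes, as in the voting data. Rows whose value
# equals the first unique value go to the left branch; all others go right.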

def recursive_split(node, max_depth, depth):
    # If the node is empty, there is nothing left to split
    if len(node) == 0:
        return node

    # Stop growing once the maximum depth is reached
    if depth >= max_depth:
        return node[:, -1]

    # We pick the attribute that gives the highest mutual information
    gain = 0
    selected_attr = None
    for indx in range(node.shape[1] - 1):
        temp = mutual_information(node[:, -1], node[:, indx])
        if temp > gain:
            gain = temp
            selected_attr = indx

    # If there's no gain at all, nothing has to be done; return the labels
    if gain < 1e-6:
        return node[:, -1]

    # We split using the selected attribute
    left, right, values = get_split(node, selected_attr)

    res = {
        'depth': depth,
        'left': left,
        'right': right,
        'values': values,
        'selected_attr': selected_attr,
    }
    for i, group in enumerate((left, right)):
        res[f'node{i}'] = recursive_split(np.array(group), max_depth, depth + 1)

    return res
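# The returned tree is either a leaf (a plain array of labels) or a dict with
# the keys 'depth', 'left', 'right', 'values', 'selected_attr', and the two
# subtrees 'node0' (left branch) and 'node1' (right branch).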

def predict(node, row):
    if row[node['selected_attr']] == node['values'][0]:
        if isinstance(node['node0'], dict):
            return predict(node['node0'], row)
        else:
            # Leaf: return the majority label of the left branch
            prediction = max(set(node['node0']), key=list(node['node0']).count)
            return prediction
    else:
        if isinstance(node['node1'], dict):
            return predict(node['node1'], row)
        else:
            # Leaf: return the majority label of the right branch
            prediction = max(set(node['node1']), key=list(node['node1']).count)
            return prediction
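# predict() walks the dict from the root: it follows 'node0' when the row
# matches the first value of the split attribute and 'node1' otherwise, then
# returns the majority label of the leaf it reaches.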

def decisionTree(train_data, test_data, max_depth):
    tree = recursive_split(train_data, max_depth, 0)
    predictions = list()
    for row in test_data:
        prediction = predict(tree, row)
        predictions.append(prediction)

    return predictions

def error(prediction, ground_truth):
    correct = 0
    for i in range(len(ground_truth)):
        if prediction[i] == ground_truth[i]:
            correct += 1

    return (len(ground_truth) - correct) / len(ground_truth)
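# error() is the misclassification rate; e.g. one wrong prediction out of four
# (made-up numbers) gives 0.25.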

def print_tree(tree, train_data, header_train):
    if isinstance(tree, dict):
        val, counts = np.unique(train_data[:, -1], return_counts=True)
        val_l, count_l = np.unique([row[-1] for row in tree['left']], return_counts=True)
        val_r, count_r = np.unique([row[-1] for row in tree['right']], return_counts=True)
        if tree['depth'] == 0:
            print(f"[{dict(zip(val, counts))}]")
        print(f"{(tree['depth']+1)*'| '}[{header_train[tree['selected_attr']]} = {tree['values'][0]}]: [{dict(zip(val_l, count_l))}]")
        print_tree(tree['node0'], train_data, header_train)
        print(f"{(tree['depth']+1)*'| '}[{header_train[tree['selected_attr']]} = {tree['values'][1]}]: [{dict(zip(val_r, count_r))}]")
        print_tree(tree['node1'], train_data, header_train)

if __name__ == "__main__":
    # dtype=str keeps every column as text, so the data loads as a plain 2D
    # array that supports the row/column indexing used above
    header_train = np.genfromtxt(sys.argv[1], delimiter=',', dtype=str, encoding='utf-8')[0, :]
    train_data = np.genfromtxt(sys.argv[1], delimiter=',', skip_header=1, dtype=str, encoding='utf-8')
    test_data = np.genfromtxt(sys.argv[2], delimiter=',', skip_header=1, dtype=str, encoding='utf-8')
    max_depth = int(sys.argv[3])
    train_out = sys.argv[4]
    test_out = sys.argv[5]
    metrics_out = sys.argv[6]

    tree = recursive_split(train_data, max_depth, 0)
    print_tree(tree, train_data, header_train)

    train_prediction = decisionTree(train_data, train_data, max_depth)
    test_prediction = decisionTree(train_data, test_data, max_depth)

    train_error = error(train_prediction, train_data[:, -1])
    test_error = error(test_prediction, test_data[:, -1])

    with open(train_out, 'w') as f:
        for row in train_prediction:
            f.write(row + '\n')

    with open(test_out, 'w') as f:
        for row in test_prediction:
            f.write(row + '\n')

    with open(metrics_out, 'w') as f:
        f.write(f'error(train): {train_error}\nerror(test): {test_error}')
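A hypothetical invocation, assuming the file names used here stand in for the ones in your assignment handout:

python decisionTree.py politicians_train.csv politicians_test.csv 3 train_labels.txt test_labels.txt metrics.txt

This prints the tree to standard output, writes one predicted label per line to the two label files, and writes both error rates to metrics.txt.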

inspect.py

import sys
import numpy as np

training_data = np.genfromtxt(sys.argv[1], delimiter=',', skip_header=1, dtype=str, encoding='utf-8')

def entropy(s):
    """
    Calculate the label entropy of the dataset and the error rate of a
    majority-vote classifier, then write both values to the output file.
    """
    res = 0
    val, counts = np.unique(s, return_counts=True)
    freqs = counts.astype('float') / len(s)
    for p in freqs:
        if p != 0.0:
            res -= p * np.log2(p)

    # The majority-vote baseline predicts the most common label for every row
    majority_vote = val[np.argmax(counts)]
    error = len([x for x in s if x != majority_vote]) / len(s)

    with open(sys.argv[2], 'w') as f:
        f.write(f'entropy: {res}\nerror: {error}')

if __name__ == "__main__":
    entropy(training_data[:, -1])
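A matching sketch for the inspector, again with made-up file names:

python inspect.py politicians_train.csv inspect_output.txt

It writes the label entropy at the root and the error rate of a majority-vote classifier to the output file.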