Erikfather/Decision_tree-python

CART算法是不是应该改成这样?大佬们批评指正!

Linjian-PA opened this issue · 0 comments

# Split the data set
def splitdataset(dataset, axis, value):
    """Partition ``dataset`` on one column and drop that column from each row.

    Args:
        dataset: iterable of rows; each row is an indexable sequence.
        axis: index of the column to test and remove. Negative indices are
            accepted and count from the end of the row.
        value: value the column is compared against.

    Returns:
        A tuple ``(rows_is, rows_not)``. ``rows_is`` holds the reduced rows
        whose ``axis`` column equals ``value``; ``rows_not`` holds the reduced
        rows of every other record. Input rows are never mutated.

    Raises:
        IndexError: if ``axis`` is out of range for a row.
    """
    retdataset_is = []   # reduced rows whose column == value
    retdataset_not = []  # reduced rows whose column != value
    for featVec in dataset:
        cell = featVec[axis]
        # Normalise a negative index first: with axis == -1 the naive slice
        # featVec[axis + 1:] is featVec[0:], i.e. the whole row, which would
        # duplicate data instead of dropping the column.
        pos = axis if axis >= 0 else axis + len(featVec)
        # list(...) lets tuple rows work too and always yields a fresh list.
        reduced = list(featVec[:pos]) + list(featVec[pos + 1:])
        if cell == value:
            retdataset_is.append(reduced)
        else:
            retdataset_not.append(reduced)
    return retdataset_is, retdataset_not

# CART algorithm
def _gini_impurity(labels):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a list of labels."""
    from collections import Counter

    total = float(len(labels))
    if not total:
        # An empty partition carries zero weight; report it as pure so the
        # caller never divides by zero.
        return 0.0
    return 1.0 - sum((count / total) ** 2 for count in Counter(labels).values())


def CART_chooseBestFeatureToSplit(dataset):
    """Pick the feature index with the lowest accumulated Gini score.

    Each row of ``dataset`` is ``[feature_0, ..., feature_k, label]``. For
    every feature and every distinct value of that feature, the rows are split
    into "equals value" / "differs from value", and the size-weighted Gini
    impurity of the two label groups is added to the feature's score. The
    feature with the smallest score wins; ties keep the lowest index.

    NOTE(review): the score is the *sum* over all values of a feature, as in
    the original code. Textbook CART instead takes the minimum over
    (feature, value) pairs -- confirm which is intended before relying on it.

    Args:
        dataset: list of rows; the last element of each row is the class label.

    Returns:
        Index of the chosen feature, or ``-1`` when ``dataset`` is empty or
        has no feature columns.
    """
    if not dataset:
        return -1

    total = float(len(dataset))
    numFeatures = len(dataset[0]) - 1  # last column is the label
    bestGini = 999999.0
    bestFeature = -1

    for i in range(numFeatures):
        gini = 0.0
        for value in set(example[i] for example in dataset):
            # Only the labels matter for impurity, so partition them directly.
            labels_is = [example[-1] for example in dataset if example[i] == value]
            labels_not = [example[-1] for example in dataset if example[i] != value]
            gini += (len(labels_is) / total) * _gini_impurity(labels_is) + \
                    (len(labels_not) / total) * _gini_impurity(labels_not)

        print(u"CART中第%d个特征的基尼值为:%.3f"%(i,gini))
        if gini < bestGini:
            bestGini = gini
            bestFeature = i
    return bestFeature