CART算法是不是应该改成这样?大佬们批评指正!
Linjian-PA opened this issue · 0 comments
Linjian-PA commented
# Split the dataset
def splitdataset(dataset, axis, value):
    """Partition ``dataset`` by whether column ``axis`` equals ``value``.

    Every row lands in exactly one of the two returned lists, and the
    ``axis`` column is removed from each returned row.

    Args:
        dataset: Iterable of rows; each row is an indexable, sliceable
            sequence (list or tuple).
        axis: Index of the column to test and drop. Negative indices are
            interpreted relative to the end of each row.
        value: The value compared (with ``==``) against ``row[axis]``.

    Returns:
        A tuple ``(retdataset_is, retdataset_not)`` of two lists of lists:
        rows whose ``axis`` column equals ``value``, and all remaining rows.

    Raises:
        IndexError: If ``axis`` is out of range for a row.
    """
    retdataset_is = []   # rows whose axis column equals value
    retdataset_not = []  # rows whose axis column differs from value
    for featVec in dataset:
        matches = featVec[axis] == value
        # Resolve a negative axis to its positive position before slicing;
        # otherwise ``featVec[axis + 1:]`` with axis == -1 is ``featVec[0:]``
        # and the entire row would be appended again instead of dropping
        # the column.
        cut = axis if axis >= 0 else len(featVec) + axis
        # list(...) keeps the result a list even when rows are tuples.
        reducedfeatVec = list(featVec[:cut]) + list(featVec[cut + 1:])
        if matches:
            retdataset_is.append(reducedfeatVec)
        else:
            retdataset_not.append(reducedfeatVec)
    return retdataset_is, retdataset_not
# CART algorithm
def _gini_impurity(labels):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a list of labels.

    An empty list yields 0.0 so that an empty side of a split contributes
    nothing instead of dividing by zero.
    """
    from collections import Counter

    total = len(labels)
    if total == 0:
        return 0.0
    counts = Counter(labels)
    return 1.0 - sum((n / float(total)) ** 2 for n in counts.values())


def CART_chooseBestFeatureToSplit(dataset):
    """Return the index of the feature with the lowest binary-split Gini index.

    Each row of ``dataset`` holds the feature values followed by the class
    label in the last position. For every feature, each distinct value
    ``v`` defines a binary split "== v" versus "!= v"; the split's score is
    the size-weighted Gini impurity of the two sides. A feature's score is
    the minimum over its values, and the feature with the smallest score
    wins (the first one on ties).

    The per-feature score is printed as a progress report.

    Args:
        dataset: List of rows (indexable sequences). Feature values and
            labels must be hashable.

    Returns:
        The zero-based index of the best feature, or -1 when ``dataset`` is
        empty or its rows contain no feature columns.
    """
    if not dataset:
        return -1

    numFeatures = len(dataset[0]) - 1  # last column is the class label
    total = float(len(dataset))
    bestGini = float("inf")
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataset)
        # Score the feature by its best binary split; summing the scores of
        # all candidate splits would penalise features with many values.
        gini = float("inf")
        for value in uniqueVals:
            labels_is = [row[-1] for row in dataset if row[i] == value]
            labels_not = [row[-1] for row in dataset if row[i] != value]
            # Weighted impurity of the two sides; an empty side (feature
            # with a single distinct value) has weight 0 and impurity 0.
            split_gini = (
                len(labels_is) / total * _gini_impurity(labels_is)
                + len(labels_not) / total * _gini_impurity(labels_not)
            )
            if split_gini < gini:
                gini = split_gini
        print(u"CART中第%d个特征的基尼值为:%.3f" % (i, gini))
        if gini < bestGini:
            bestGini = gini
            bestFeature = i
    return bestFeature