catboost/catboost

The results calculated according to the formula described in the doc are different from the results displayed by the model.

ccylance opened this issue · 0 comments

catboost version: catboost 1.2
Operating System: macos 14.4.1
CPU: M1
GPU: no

The official documentation describes how the L2 score function is calculated: https://catboost.ai/en/docs/concepts/algorithm-score-functions
According to the formula shown there, the best split at depth 1 should be 7.67857143, but the model shows 3.4375.
Here is my code:

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool

class RmseObjective(object):
    """User-defined RMSE objective, passed to CatBoost as ``loss_function``."""

    def calc_ders_range(self, approxes, targets, weights):
        """Return the per-object derivatives of the RMSE objective.

        Args:
            approxes: indexable sequence of current predictions.
            targets: indexable sequence of target values; must have the same
                length as ``approxes``.
            weights: optional indexable sequence of per-object weights with
                the same length as ``approxes``, or ``None``.

        Returns:
            A list of ``(der1, der2)`` tuples, one per object, where ``der1``
            is ``target - approx`` and ``der2`` is ``-1``; both are scaled by
            the object's weight when weights are given.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        # Debug output kept from the reproduction: shows what the trainer
        # passes in on each call.
        print('preds:', approxes, len(approxes))

        result = []
        for index in range(len(targets)):
            der1 = targets[index] - approxes[index]
            # The objective is -(target - approx)^2 / 2, so its second
            # derivative with respect to approx is the constant -1 (not 0).
            der2 = -1
            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
    
class RmseMetric(object):
    """User-defined RMSE metric, passed to CatBoost as ``eval_metric``."""

    def get_final_error(self, error, weight):
        """Turn the accumulated squared error and weight into an RMSE value."""
        # The tiny constant keeps the division defined when weight is zero.
        denominator = weight + 1e-38
        return np.sqrt(error / denominator)

    def is_max_optimal(self):
        """Report that smaller metric values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted squared error over all objects.

        ``approxes`` must hold exactly one sequence of predictions whose
        length matches ``target``. ``weight`` is either ``None`` (every
        object counts as 1.0) or an indexable sequence of per-object weights.

        Returns a ``(error_sum, weight_sum)`` tuple.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]
        print("approx:", approx)

        error_sum, weight_sum = 0.0, 0.0
        for i in range(len(approx)):
            residual = approx[i] - target[i]
            if weight is None:
                w = 1.0
            else:
                w = weight[i]
            error_sum += w * (residual**2)
            weight_sum += w

        return error_sum, weight_sum

# Single numeric feature with 16 observations.
feature_values = np.array([
    11.625, 2.75, 10.40625, 5.8125,
    4.125, 7.60714286, 10.65, 13.3125,
    8.25, 12.75, 8.875, 8.53125,
    11.625, 11.89285714, 11.375, 7.75,
])

d = pd.DataFrame()
d['f_1'] = feature_values

# Target is simply the row position: 0, 1, ..., len(d) - 1.
y = np.arange(len(d))

# One tree of depth 1 with no regularisation, no bootstrap and a learning
# rate of 1, using the user-defined objective and metric.
model_params = dict(
    iterations=1,
    depth=1,
    boosting_type='Ordered',
    has_time=True,
    leaf_estimation_method='Simple',
    l2_leaf_reg=0,
    learning_rate=1,
    loss_function=RmseObjective(),
    eval_metric=RmseMetric(),
    score_function="L2",
    bootstrap_type="No",
)
model = CatBoostRegressor(**model_params)

pools = Pool(d, y)
model.fit(pools, silent=True)
model.plot_tree(tree_idx=0, pool=pools)

image

Here are the results: