The results calculated according to the formula described in the doc are different from the results displayed by the model.

Question

The results calculated according to the formula described in the doc are different from the results displayed by the model.

ccylance opened this issue a month ago · 0 comments

catboost version: catboost 1.2
Operating System: macos 14.4.1
CPU: M1
GPU: no

According to the official documentation, the L2 score function is calculated as described here: https://catboost.ai/en/docs/concepts/algorithm-score-functions
Using the formula shown on that page, the best split at depth 1 should be 7.67857143, but the model shows 3.4375.
Here is my code:

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool

class RmseObjective(object):
    """User-defined RMSE-style objective passed as ``loss_function``.

    The derivatives returned are those of ``-(target - approx) ** 2 / 2``
    with respect to ``approx``: the first derivative is ``target - approx``
    and the second derivative is the constant ``-1``.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return one ``(der1, der2)`` pair per object.

        Args:
            approxes: Indexable sequence of current predictions.
            targets: Indexable sequence of target values, same length as
                ``approxes``.
            weights: Optional indexable sequence of per-object weights, same
                length as ``approxes``; ``None`` means unweighted.

        Returns:
            A list of ``(der1, der2)`` tuples, in object order. Both
            derivatives are multiplied by the object's weight when weights
            are given.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        print('preds:', approxes, len(approxes))
        # Index-based access is kept on purpose: the inputs are only assumed
        # to support ``len()`` and ``[]``.
        for index in range(len(targets)):
            der1 = targets[index] - approxes[index]
            # The second derivative of -(t - a)^2 / 2 w.r.t. ``a`` is -1.
            # It was previously 0, which is not the derivative of der1 and
            # also made the weight scaling below a no-op.
            der2 = -1
            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
    
class RmseMetric(object):
    """User-defined RMSE metric passed as ``eval_metric``."""

    def is_max_optimal(self):
        """Report that smaller metric values are better."""
        return False

    def get_final_error(self, error, weight):
        """Turn the accumulated ``(error, weight)`` pair into the RMSE value.

        A tiny constant is added to ``weight`` before dividing.
        """
        denominator = weight + 1e-38
        return np.sqrt(error / denominator)

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted squared error over all objects.

        Args:
            approxes: Sequence holding exactly one sequence of predictions.
            target: Indexable sequence of target values, same length as the
                predictions.
            weight: Optional indexable sequence of per-object weights;
                ``None`` makes every object count with weight ``1.0``.

        Returns:
            A ``(error_sum, weight_sum)`` tuple.
        """
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)

        squared_error_total = 0.0
        weight_total = 0.0
        print("approx:", predictions)

        unweighted = weight is None
        for position in range(len(predictions)):
            if unweighted:
                object_weight = 1.0
            else:
                object_weight = weight[position]
            weight_total += object_weight
            residual = predictions[position] - target[position]
            squared_error_total += object_weight * (residual**2)

        return squared_error_total, weight_total

# Single-feature dataset with 16 rows; the target is simply the row index.
d = pd.DataFrame({
    'f_1': np.array([
        11.625, 2.75, 10.40625, 5.8125,
        4.125, 7.60714286, 10.65, 13.3125,
        8.25, 12.75, 8.875, 8.53125,
        11.625, 11.89285714, 11.375, 7.75,
    ]),
})
y = np.arange(len(d))

# One tree of depth 1, so the fitted model consists of a single split.
model = CatBoostRegressor(
    iterations=1,
    depth=1,
    boosting_type='Ordered',
    has_time=True,
    leaf_estimation_method='Simple',
    l2_leaf_reg=0,
    learning_rate=1,
    loss_function=RmseObjective(),
    eval_metric=RmseMetric(),
    score_function="L2",
    bootstrap_type="No",
)

pools = Pool(d, y)
model.fit(pools, silent=True)
# Render the first (and only) tree to inspect the chosen split.
model.plot_tree(tree_idx=0, pool=pools)

Here are the results: