OpenMOSS/MOSS

Why isn't the loss on <|Human|> masked out during finetuning?

jouw opened this issue · 0 comments

jouw commented
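Looking at the data construction below, no_loss_spans only covers the meta instruction and the contents of the Tool Responses turns, so the <|Human|> tokens still contribute to the loss: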
        with open(os.path.join(self.data_dir, f'{self.data_type}.jsonl'), 'r') as f:
            for line in f:
                sample = json.loads(line)

                chat = sample['chat']
                num_turns = int(sample['num_turns'])

                meta_instruction = sample['meta_instruction']
                instruction_ids = self.tokenizer.encode(meta_instruction)
                assert isinstance(instruction_ids, list) and len(instruction_ids) > 0
                
                input_ids = copy.deepcopy(instruction_ids)
                # exclude every token of the meta instruction from the loss
                no_loss_spans = [(0, len(instruction_ids))]

                for i in range(num_turns):
                    cur_turn_ids = []
                    cur_no_loss_spans = []
                    cur_turn = chat[f'turn_{i+1}']
                    for key, value in cur_turn.items():

                        cur_ids = self.tokenizer.encode(value)

                        if key == 'Tool Responses':
                            # Keep the loss on the format tokens (<|Results|>: ... <eor>\n);
                            # only the tool output in between is masked.
                            cur_no_loss_spans.append(
                                (len(input_ids + cur_turn_ids) + 5,
                                 len(input_ids + cur_turn_ids + cur_ids) - 2))

                        assert isinstance(cur_ids, list) and len(cur_ids) > 0

                        cur_turn_ids.extend(cur_ids)

                    # drop this and all later turns once the 2048-token limit would be exceeded
                    if len(input_ids + cur_turn_ids) > 2048:
                        break

                    input_ids.extend(cur_turn_ids)
                    no_loss_spans.extend(cur_no_loss_spans)

                # skip samples where not even the first turn fit after the meta instruction
                if len(input_ids) == len(instruction_ids):
                    continue

                assert len(input_ids) > 0 and len(input_ids) <= 2048

                self.data.append(input_ids)
                self.no_loss_spans.append(no_loss_spans)
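
For reference, here is a minimal sketch of how I understand these spans to be consumed downstream, assuming they are eventually turned into -100 labels for CrossEntropyLoss; the helper build_labels and the 'Human' key are my own assumptions for illustration, not code from this repo:

    import copy

    IGNORE_INDEX = -100  # default ignore_index of torch.nn.CrossEntropyLoss

    def build_labels(input_ids, no_loss_spans):
        """Convert (start, end) no-loss spans into labels: positions inside
        a span become IGNORE_INDEX and contribute nothing to the loss."""
        labels = copy.deepcopy(input_ids)
        for start, end in no_loss_spans:
            labels[start:end] = [IGNORE_INDEX] * (end - start)
        return labels

    # A hypothetical extra branch in the turn loop (assuming user turns are
    # stored under the key 'Human') that would also mask the human tokens:
    #
    #     if key == 'Human':
    #         cur_no_loss_spans.append(
    #             (len(input_ids + cur_turn_ids),
    #              len(input_ids + cur_turn_ids + cur_ids)))

    if __name__ == '__main__':
        ids = list(range(10))
        spans = [(0, 3), (5, 7)]         # e.g. meta instruction + one masked turn
        print(build_labels(ids, spans))  # [-100, -100, -100, 3, 4, -100, -100, 7, 8, 9]

Is leaving the loss on the <|Human|> tokens intentional, or should a span like the one sketched above also be recorded?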