def preprocess_function(examples):
    # Queries: apply the instruction template to each example before tokenizing.
    queries = examples["sentence"]
    queries = [get_detailed_instruct(task, query) for query in queries]
|
    # Tokenize to max_length - 1 so there is room to append the EOS token below.
    batch_dict = tokenizer(queries, max_length=args.max_length - 1, return_attention_mask=False, padding=False, truncation=True)
    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')

    # Collect the query features under a "sentence_" prefix.
    result = {f"sentence_{k}": v for k, v in batch_dict.items()}

    # Positive passages: same tokenize-then-append-EOS treatment, without the
    # instruction prefix.
    queries = examples["positive"]
    batch_dict = tokenizer(queries, max_length=args.max_length - 1, return_attention_mask=False, padding=False, truncation=True)
    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')

    for k, v in batch_dict.items():
        result[f"positive_{k}"] = v

    # Hard negatives: identical treatment.
    queries = examples["negative"]
    batch_dict = tokenizer(queries, max_length=args.max_length - 1, return_attention_mask=False, padding=False, truncation=True)
    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')

    for k, v in batch_dict.items():
        result[f"negative_{k}"] = v

    # Placeholder labels column (all zeros, one per example).
    result["labels"] = [0] * len(examples["sentence"])
    return result
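
# NOTE (assumption): `task` and `get_detailed_instruct` are defined elsewhere in
# this script and are not shown here. A minimal sketch, assuming the
# e5-mistral-style instruction format, would be:
#
#     task = "Given a web search query, retrieve relevant passages that answer the query"
#
#     def get_detailed_instruct(task_description: str, query: str) -> str:
#         return f"Instruct: {task_description}\nQuery: {query}"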
|
|
|
processed_datasets = dataset.map(
    preprocess_function,