piotrkawa/deepfake-whisper-features

Fix for `deepfake_asvspoof_dataset.py`

Closed this issue · 0 comments

Code Changes:

From:

    def read_protocol(self):
        """Read the protocol file and return its samples as a DataFrame.

        Every line of the protocol file is classified by its sixth
        space-separated field: "bonafide" lines and "spoof" lines are
        collected separately, and lines with any other label are ignored.
        Each group is passed through ``self.split_samples`` and every line
        it returns is recorded with ``self.add_line_to_samples``; spoof
        lines are recorded before bonafide lines.

        Returns:
            A ``pd.DataFrame`` built from a dict with the keys
            "sample_name", "label", "path" and "attack_type".
        """
        # Column-oriented accumulator; each key maps to one list per column.
        samples = {
            "sample_name": [],
            "label": [],
            "path": [],
            "attack_type": [],
        }

        real_samples = []
        fake_samples = []
        with open(Path(self.path) / self.protocol_file_name, "r") as file:
            for line in file:
                # NOTE: indexing [5] raises IndexError for a blank line or any
                # line with fewer than six space-separated fields.
                label = line.strip().split(" ")[5]

                if label == "bonafide":
                    real_samples.append(line)
                elif label == "spoof":
                    fake_samples.append(line)

        # The raw lines (not parsed fields) are handed to split_samples.
        fake_samples = self.split_samples(fake_samples)
        for line in fake_samples:
            samples = self.add_line_to_samples(samples, line)

        real_samples = self.split_samples(real_samples)
        for line in real_samples:
            samples = self.add_line_to_samples(samples, line)

        return pd.DataFrame(samples)

    def add_line_to_samples(self, samples, line):
        """Parse one protocol line and append its fields to ``samples``.

        Args:
            samples: Dict with the list-valued keys "sample_name", "label",
                "attack_type" and "path"; it is mutated in place.
            line: One raw protocol line with space-separated fields.

        Returns:
            The same ``samples`` dict that was passed in.
        """
        # The unpacking requires exactly eight space-separated fields; any
        # other count raises ValueError. Only the second field (sample name)
        # and the sixth field (label) are used.
        _, sample_name, _, _, _, label, _, _ = line.strip().split(" ")
        samples["sample_name"].append(sample_name)
        samples["label"].append(label)
        # The label is also stored as the attack type.
        samples["attack_type"].append(label)

        # NOTE(review): the subscript lookup fails for a sample name missing
        # from self.flac_paths (KeyError if it is a plain dict - confirm).
        sample_path = self.flac_paths[sample_name]
        # NOTE: assert statements are stripped when Python runs with -O.
        assert sample_path.exists()
        samples["path"].append(sample_path)

        return samples

To:

    def read_protocol(self):
        """Read the protocol file and return its usable samples as a DataFrame.

        Every line of the protocol file is classified by its sixth
        space-separated field: "spoof" and "bonafide" lines are collected
        separately, and lines with any other label are ignored. Lines that
        are blank or have fewer than six fields are skipped instead of
        raising ``IndexError``. Each group is passed through
        ``self.split_samples`` and every line it returns is handed to
        ``self.add_line_to_samples``; spoof lines are processed before
        bonafide lines.

        Returns:
            A ``pd.DataFrame`` built from a dict with the keys
            "sample_name", "label", "path" and "attack_type".
        """
        samples = {
            "sample_name": [],
            "label": [],
            "path": [],
            "attack_type": [],
        }

        real_samples = []
        fake_samples = []
        with open(Path(self.path) / self.protocol_file_name, "r") as file:
            for line in file:
                fields = line.strip().split(" ")
                if len(fields) < 6:
                    # Blank lines are skipped silently; other short lines are
                    # reported, because no label can be read from them.
                    if line.strip():
                        print(f"Error: Not enough elements to read a label from line '{line}'. Skipping.")
                    continue

                label = fields[5]
                if label == "bonafide":
                    real_samples.append(line)
                elif label == "spoof":
                    fake_samples.append(line)

        # Keep the original call order: split and add the spoof lines first,
        # then split and add the bonafide lines.
        for group in (fake_samples, real_samples):
            for line in self.split_samples(group):
                # add_line_to_samples returns None for a line it skipped.
                new_samples = self.add_line_to_samples(samples, line)
                if new_samples is not None:
                    samples = new_samples

        return pd.DataFrame(samples)


    def add_line_to_samples(self, samples, line):
        """Append one protocol line's fields to ``samples``, or skip the line.

        A line is skipped when it has fewer than eight space-separated
        fields, or when ``self.flac_paths`` has no existing file for its
        sample name. Nothing is appended for a skipped line, so the four
        column lists always keep the same length.

        Args:
            samples: Dict with the list-valued keys "sample_name", "label",
                "attack_type" and "path"; it is mutated in place.
            line: One raw protocol line with space-separated fields.

        Returns:
            The same ``samples`` dict when the line was added, or ``None``
            when the line was skipped.
        """
        try:
            split_line = line.strip().split(" ")
        except (AttributeError, TypeError) as e:
            # Raised when ``line`` is not a str (e.g. None or bytes).
            print(f"Error: {e}. Couldn't process line '{line}'. Skipping.")
            return None

        if len(split_line) < 8:
            print(f"Error: Not enough elements to unpack from line '{line}'. Skipping.")
            return None

        # Assuming that sample_name is always the second element and label is always the sixth element
        sample_name = split_line[1]
        label = split_line[5]

        # Validate the audio path BEFORE appending anything: appending the
        # name and label first and then bailing out would leave the column
        # lists with different lengths and break DataFrame construction.
        sample_path = self.flac_paths.get(sample_name)
        if sample_path is None or not sample_path.exists():
            print(f"Error: Sample path for '{sample_name}' does not exist. Skipping.")
            return None

        samples["sample_name"].append(sample_name)
        samples["label"].append(label)
        # The label is also stored as the attack type.
        samples["attack_type"].append(label)
        samples["path"].append(sample_path)

        return samples

The dataset directory should be structured like this:

.ASVspoof2021
└── DF
    ├── ASVspoof2021.DF.cm.eval.trl.txt
    ├── ASVspoof2021_DF_eval_part00
    │   └── ASVspoof2021_DF_eval
    │       └── flac
    ├── ASVspoof2021_DF_eval_part01
    │   └── ASVspoof2021_DF_eval
    │       └── flac
    ├── ASVspoof2021_DF_eval_part02
    │   └── ASVspoof2021_DF_eval
    │       └── flac
    ├── ASVspoof2021_DF_eval_part03
    │   └── ASVspoof2021_DF_eval
    │       └── flac
    ├── keys
    │   ├── CM
    │   │   ├── CQCC-GMM
    │   │   │   └── score.txt
    │   │   ├── LFCC-GMM
    │   │   │   └── score.txt
    │   │   ├── LFCC-LCNN
    │   │   │   └── score.txt
    │   │   ├── RawNet2
    │   │   │   └── score.txt
    │   │   └── trial_metadata.txt
    │   └── README.txt
    ├── LICENSE.DF.txt
    └── README.DF.txt