added new fixed code

2026-01-10 20:35:21 +01:00
parent 79f93e5c29
commit 8379ac8e12
6 changed files with 396 additions and 110 deletions
--- a/caveman_wavedataset.py
+++ b/caveman_wavedataset.py
@@ -4,29 +4,21 @@ import librosa
 from torch.utils.data import Dataset
 import numpy as np
 import random
-
-
-HOP = 512
-N_FFT = 1024
-DURATION = 2.0
-SR = 44100
-
-
-def audio_to_logmag(audio):
-    # STFT
-    stft = librosa.stft(audio, n_fft=N_FFT, hop_length=HOP)
-    mag = np.abs(stft)
-    logmag = np.log1p(mag)  # log(1 + x) for stability
-    return logmag  # shape: (1, freq_bins, time_frames) = (1, 513, T)
+from settings import SR, N_FFT
+from misc import audio_to_logmag


 class WaveformDataset(Dataset):
-    def __init__(self, lossy_dir, clean_dir, sr=SR, segment_sec=4):
-        self.cache = dict()
+    mean = np.zeros([N_FFT // 2 + 1])
+    std = np.ones([N_FFT // 2 + 1])
+
+    # segment_duration is a very important parameter; read cavemanml.py to see how and why to adjust it.
+    # For the purposes of this file, it is the length (in seconds, given SR in Hz) of the audio clip selected from each dataset item.
+    def __init__(self, lossy_dir, clean_dir, segment_duration, sr=SR):
+        self.segment_duration = segment_duration
        self.sr = sr
        self.lossy_dir = lossy_dir
        self.clean_dir = clean_dir
-        self.segment_len = int(segment_sec * sr)
        self.lossy_files = sorted(os.listdir(lossy_dir))
        self.clean_files = sorted(os.listdir(clean_dir))
        self.file_pairs = [
@@ -51,9 +43,9 @@ class WaveformDataset(Dataset):
        min_len = min(len(lossy), len(clean))
        lossy, clean = lossy[:min_len], clean[:min_len]

-        # Random 2-second clip
+        # Random clip

-        clip_len = int(DURATION * SR)
+        clip_len = int(self.segment_duration * SR)
        if min_len < clip_len:
            # pad if too short
            lossy = np.pad(lossy, (0, clip_len - min_len))
@@ -61,14 +53,21 @@ class WaveformDataset(Dataset):
            start = 0
        else:
            start = random.randint(0, min_len - clip_len)
+            # start = 0
            lossy = lossy[start : start + clip_len]
            clean = clean[start : start + clip_len]

+        logmag_x = audio_to_logmag(lossy)
+        logmag_y = audio_to_logmag(clean)
+
+        logmag_x_norm = (logmag_x - self.mean[:, None]) / (self.std[:, None] + 1e-8)
+        logmag_y_norm = (logmag_y - self.mean[:, None]) / (self.std[:, None] + 1e-8)
+
        ans = (
-            torch.from_numpy(audio_to_logmag(lossy)).unsqueeze(0),
-            torch.from_numpy(audio_to_logmag(clean)).unsqueeze(0),
+            torch.from_numpy(logmag_x_norm).float().unsqueeze(0),
+            torch.from_numpy(logmag_y_norm).float().unsqueeze(0),
        )

-        self.cache[idx] = ans
+        # self.cache[idx] = ans

        return ans