diff --git a/client/demo/dist/assets/gui_settings/GUI.json b/client/demo/dist/assets/gui_settings/GUI.json index ccf3be9e5..c43596283 100644 --- a/client/demo/dist/assets/gui_settings/GUI.json +++ b/client/demo/dist/assets/gui_settings/GUI.json @@ -21,7 +21,7 @@ { "name": "configArea", "options": { - "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"], + "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx", "fcpe"], "inputChunkNums": [1, 2, 4, 6, 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048, 4096, 8192, 16384] } } diff --git a/client/demo/public/assets/gui_settings/GUI.json b/client/demo/public/assets/gui_settings/GUI.json index ccf3be9e5..e234f1a40 100644 --- a/client/demo/public/assets/gui_settings/GUI.json +++ b/client/demo/public/assets/gui_settings/GUI.json @@ -21,7 +21,7 @@ { "name": "configArea", "options": { - "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"], + "detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx", "fcpe"], "inputChunkNums": [1, 2, 4, 6, 8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048, 4096, 8192, 16384] } } diff --git a/client/lib/src/const.ts b/client/lib/src/const.ts index 69d3f1939..76db00f65 100644 --- a/client/lib/src/const.ts +++ b/client/lib/src/const.ts @@ -56,6 +56,7 @@ export const F0Detector = { crepe_tiny: "crepe_tiny", rmvpe: "rmvpe", rmvpe_onnx: "rmvpe_onnx", + fcpe: "fcpe", } as const; export type F0Detector = (typeof F0Detector)[keyof typeof F0Detector]; diff --git a/server/const.py b/server/const.py index 15f758c39..eaa1a9ceb 100644 --- a/server/const.py +++ b/server/const.py @@ -82,6 +82,7 @@ class EnumInferenceTypes(Enum): "crepe_tiny", "rmvpe", "rmvpe_onnx", + "fcpe", ] ServerAudioDeviceType: TypeAlias = 
Literal["audioinput", "audiooutput"]
diff --git a/server/requirements.txt b/server/requirements.txt
index 4a5e79e7c..8cbebe89b 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -27,3 +27,4 @@ websockets==11.0.2
 sounddevice==0.4.6
 dataclasses_json==0.5.7
 onnxsim==0.4.28
+torchfcpe
diff --git a/server/voice_changer/RVC/pitchExtractor/FcpePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/FcpePitchExtractor.py
new file mode 100644
index 000000000..d90c3421f
--- /dev/null
+++ b/server/voice_changer/RVC/pitchExtractor/FcpePitchExtractor.py
@@ -0,0 +1,71 @@
+import numpy as np
+from const import PitchExtractorType
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
+import torchfcpe
+
+class FcpePitchExtractor(PitchExtractor):
+    """Pitch (f0) extractor that runs the model bundled with torchfcpe."""
+
+    def __init__(self, gpu: int):
+        """Resolve the device for `gpu` and load the bundled FCPE model onto it."""
+        super().__init__()
+        self.pitchExtractorType: PitchExtractorType = "fcpe"
+        self.device = DeviceManager.get_instance().getDevice(gpu)
+        self.fcpe = torchfcpe.spawn_bundled_infer_model(self.device)
+
+    # Adapted from this project's CrepePitchExtractor and the RVC FCPE inference code; not yet optimized.
+    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+        """Estimate f0 for `audio` and write it into the tail of `pitchf`.
+
+        Args:
+            audio: Waveform to analyse; must support `.to(device)`
+                (presumably a 1-D torch tensor -- confirm against the caller).
+            pitchf: NumPy f0 buffer, updated in place and also returned.
+            f0_up_key: Pitch shift in semitones applied to the estimate.
+            sr: Sample rate of `audio` in Hz.
+            window: Frame hop in samples; `silence_front` is rounded down to
+                a whole number of these frames.
+            silence_front: Seconds of leading audio to skip before inference.
+
+        Returns:
+            `(pitch_coarse, pitchf)`, where `pitch_coarse` is the mel-scaled
+            f0 quantized to integers in the range 1..255.
+        """
+        # Snap the skipped lead-in to a frame boundary.
+        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / sr
+
+        silence_front_offset = int(np.round(real_silence_front * sr))
+        audio = audio[silence_front_offset:]
+
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+        f0 = self.fcpe.infer(
+            audio.to(self.device).unsqueeze(0).float(),
+            # Use the caller's sample rate rather than a hard-coded 16000 so
+            # it stays consistent with the frame maths above.
+            sr=sr,
+            decoder_mode="local_argmax",
+            threshold=0.006,
+        )
+        # reshape(-1) keeps a 1-D result even for a single frame, where
+        # squeeze() would yield a 0-dim tensor and break the indexing below.
+        f0 = f0.reshape(-1)
+
+        f0 *= pow(2, f0_up_key / 12)
+        f0 = f0.detach().cpu().numpy()[:pitchf.shape[0]]
+        # With zero frames, pitchf[-0:] would address the whole buffer and the
+        # assignment would fail, so leave pitchf untouched in that case.
+        if f0.shape[0] > 0:
+            pitchf[-f0.shape[0]:] = f0
+        f0bak = pitchf.copy()
+        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+        f0_mel = np.clip(
+            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
+        )
+        pitch_coarse = f0_mel.astype(int)
+        return pitch_coarse, pitchf
diff --git a/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
index 6df0e894a..1032e7bc4 100644
--- a/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
+++ b/server/voice_changer/RVC/pitchExtractor/PitchExtractorManager.py
@@ -43,6 +43,10 @@ def loadPitchExtractor(
             return RMVPEPitchExtractor(cls.params.rmvpe, gpu)
         elif pitchExtractorType == "rmvpe_onnx":
             return RMVPEOnnxPitchExtractor(cls.params.rmvpe_onnx, gpu)
+        elif pitchExtractorType == "fcpe":
+            # Local import so the name is in scope; it also defers loading torchfcpe until FCPE is selected.
+            from voice_changer.RVC.pitchExtractor.FcpePitchExtractor import FcpePitchExtractor
+            return FcpePitchExtractor(gpu)
         else:
             # return hubert as default
             print("[Voice Changer] PitchExctractor not found", pitchExtractorType)