(22) Text-to-speech, TTS, long text, Edge-TTS
The code in this article uses Edge-TTS to convert text to speech; the result can be saved as an MP3 or WAV file. There is no limit on the text length.
The program calls the cloud-based Edge-TTS service. I only wrapped it in a thin layer and built a UI around it.
A ready-to-run executable can be downloaded from Baidu Netdisk:
https://pan.baidu.com/s/1ntMnDWFvnS7tLUd9jku8Ew?pwd=hims
The code is as follows:
# Text to Speech Tool V1.0
#
# Module-level setup: imports, shared global state used by the GUI classes
# further down, and loading of the persisted user settings.
import asyncio
import os
import sys
import time
import traceback

import cv2
import edge_tts
import yaml
# import librosa
from playsound import playsound
from PyQt5 import QtWidgets
from PyQt5.QtCore import Qt, QTimer, QThread, pyqtSignal, pyqtSlot
from PyQt5.QtWidgets import QWidget, QMessageBox, QFileDialog, QApplication, QSlider

import hbt_funcs as hbt
from txt2audio_UI import Ui_txt2voice

# The selector event loop policy only exists on Windows; guard the lookup so
# importing this module elsewhere does not raise AttributeError.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

hbt.is_contains_chinese(os.getcwd())  # Check whether there is a Chinese path

# Speech rate passed to edge_tts.Communicate as a signed percentage with no
# embedded spaces (e.g. "+0%", "-25%").
rates = '+0%'
run_flag = 0
stop_flag = False
bar = 0  # current progress-bar value, advanced by the GUI timer

# Voice short names; 'zh-CN-' is prepended when a voice is selected.
voices_list = [
    'XiaoxiaoNeural',
    'XiaoyiNeural',
    'YunxiaNeural',
    'liaoning-XiaobeiNeural',
    'shaanxi-XiaoniNeural',
    'YunjianNeural',
    'YunxiNeural',
    'YunyangNeural',
]
my_title = "iCANX text-to-speech tool"
settings_file = "settings.yaml"

# Read the configuration file (settings_file). A missing, empty or malformed
# file falls back to an empty mapping so the defaults apply.
settings = {}
if os.path.exists(settings_file):
    try:
        with open(settings_file, 'r', encoding='utf-8') as f:
            settings = yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        traceback.print_exc()
        settings = {}
    if not isinstance(settings, dict):
        # Happens if the file is empty (None) or holds a non-mapping document.
        settings = {}
# Settings-derived globals. Values come from a user-editable YAML file, so they
# are validated before use.
out_dir = settings.get('out_dir', os.getcwd())
if not (isinstance(out_dir, str) and os.path.isdir(out_dir)):
    # The remembered folder may have been removed since the last run.
    out_dir = os.getcwd()
voices_select = settings.get('voices_select', 0)
if not isinstance(voices_select, int) or not 0 <= voices_select < len(voices_list):
    voices_select = 0
mp3_wav = settings.get('mp3_wav', 0)  # 0 = MP3 only, 1 = also produce WAV
voices = 'zh-CN-' + voices_list[voices_select]

from mutagen.mp3 import MP3


def get_media_length(file_path):
    """Return the duration of the MP3 at *file_path* as reported by mutagen."""
    audio = MP3(file_path)
    return audio.info.length


class EdgeTTSTrans(QThread):
    """Worker thread that synthesizes text to ``<filename>.mp3`` via edge-tts.

    Emits ``sinout`` with ``'OK'`` on success or ``'ERROR'`` on failure.
    """

    sinout = pyqtSignal(str)

    def __init__(self, winshot, texts, filename):
        super().__init__()
        self.main_win = winshot
        self.rates = rates  # snapshot of the global rate at creation time
        self.texts = texts
        self.filename = filename + '.mp3'

    def run(self):
        """Run the synthesis coroutine and report the outcome via ``sinout``."""
        try:
            asyncio.run(self.edge_tts_trans(self.texts))
        except Exception:
            # Keep the reason visible on the console; the GUI shows a generic
            # message when it receives 'ERROR'.
            traceback.print_exc()
            self.sinout.emit('ERROR')
        else:
            self.sinout.emit('OK')

    async def edge_tts_trans(self, text):
        """Synthesize *text* with the current global voice and save it."""
        communicate = edge_tts.Communicate(text=text, rate=self.rates, voice=voices)
        await communicate.save(self.filename)


class PlayAudioWav(QThread):
    """Worker thread that synthesizes a short sample to ``temp.mp3`` and plays it."""

    def __init__(self, winshot, texts):
        super().__init__()
        self.winshot = winshot
        self.rates = rates
        self.texts = texts
        self._remove_temp()
        # Widgets must only be touched from the GUI thread. ``finished`` is
        # delivered to this QObject's own thread (the one that created it),
        # so the button is re-enabled there instead of inside run().
        self.finished.connect(self._restore_button)

    @staticmethod
    def _remove_temp():
        """Delete ``temp.mp3`` if present; report but do not raise on failure."""
        try:
            os.remove("temp.mp3")
        except FileNotFoundError:
            pass
        except OSError:
            traceback.print_exc()

    @pyqtSlot()
    def _restore_button(self):
        """Re-enable the audition button once the thread has finished."""
        self.winshot.try_lisson.setEnabled(True)

    def run(self):
        """Synthesize the sample, play it, then clean up the temporary file."""
        try:
            asyncio.run(self.edge_tts_trans(self.texts))
            playsound("temp.mp3")
        except Exception:
            traceback.print_exc()
        finally:
            self._remove_temp()

    async def edge_tts_trans(self, text):
        """Synthesize *text* to ``temp.mp3``; errors are printed, not raised."""
        self.communicate = edge_tts.Communicate(text=text, rate=self.rates, voice=voices)
        try:
            await self.communicate.save('temp.mp3')
        except Exception:
            print('Error in Async...;')
            traceback.print_exc()


class Winshot(QWidget, Ui_txt2voice):
    """Main window: collects text and options and drives the worker threads."""

    def __init__(self):
        super().__init__()
        global run_flag
        self.start_time = 0
        self.voice_len = 0
        self.text_len = 0
        self.setupUi(self)
        self.createLayout()
        self.setWindowTitle(my_title)
        self.setWindowIcon(hbt.GetIco('ican'))
        self.setFixedSize(self.size())
        self.setWindowFlags(Qt.WindowMinimizeButtonHint)
        self.my_timer = QTimer(self)
        # Connect once here: connecting on every start would add another
        # connection per run and advance the bar several steps per tick.
        self.my_timer.timeout.connect(self.running)
        self.show()
        run_flag = 1

    def _stats_text(self, total_time):
        """Build the status-line text shared by the timer tick and completion."""
        return (
            f"Statistics information: Text length ({self.text_len} words)"
            f" | Audio length ({self.voice_len:.1f} seconds)"
            f" | Time consumed ({total_time:.1f} seconds)"
        )

    def show_error(self, str):
        """Show *str* in a modal warning box."""
        QMessageBox.warning(self, my_title, '\n\n' + str + '\n\n', QMessageBox.Ok)

    def set_False_Btn(self):
        """Disable the controls that must not be used while converting."""
        for widget in (self.outButton, self.startButton, self.quitButton, self.out_path):
            widget.setEnabled(False)

    def set_True_Btn(self):
        """Re-enable the controls disabled by ``set_False_Btn``."""
        for widget in (self.outButton, self.startButton, self.quitButton, self.out_path):
            widget.setEnabled(True)

    def start_run(self):
        """Validate the input text and start a conversion in a worker thread."""
        global stop_flag, bar
        self.save_yaml()
        text = self.textEdit.toPlainText()
        self.text_len = len(text)
        if not text.strip():
            stop_flag = True
            self.show_error('The text in the text box cannot be empty...')
            return
        stop_flag = False
        bar = 0  # each run starts with an empty progress bar
        self.voice_len = 0  # do not show the previous run's audio length
        self.set_False_Btn()
        self.start_time = time.time()
        self.filename = out_dir + '/' + time.strftime("%Y_%m_%d_%H.%M.%S")
        self.my_thread = EdgeTTSTrans(self, text, self.filename)
        self.my_thread.sinout.connect(self.signal_coming)
        self.my_thread.start()
        self.my_timer.start(500)

    def _convert_to_wav(self):
        """Convert the generated MP3 to WAV with the ffmpeg under ``sysenv``."""
        import subprocess

        # An argument list avoids shell quoting problems with spaces in the
        # output path, and os.path.join avoids the "\f" escape that a literal
        # "sysenv\ffmpeg" would contain.
        command = [
            os.path.join('sysenv', 'ffmpeg'),
            '-i', self.filename + '.mp3',
            self.filename + '.wav',
        ]
        try:
            subprocess.run(command, check=False)
        except OSError:
            traceback.print_exc()

    def signal_coming(self, str):
        """Handle the worker result (``'OK'`` or ``'ERROR'``) and reset the UI."""
        self.my_timer.stop()  # stop the progress animation on success and failure
        if str == 'OK':
            mp3_path = self.filename + '.mp3'
            try:
                self.voice_len = get_media_length(mp3_path)
            except Exception:
                # A length probe failure should not abort the completion flow.
                traceback.print_exc()
                self.voice_len = 0
            total_time = time.time() - self.start_time
            self.run_state.setText(self._stats_text(total_time))
            self.progressBar.setValue(100)
            # Let the full bar and status text paint before the modal dialog.
            QApplication.processEvents()
            r_button = QMessageBox.question(
                self,
                my_title,
                "\n\n\nComplete this text-to-speech conversion process..."
                "\n\nDo you need to play it?\n\n\n",
                QMessageBox.Yes | QMessageBox.No,
            )
            if r_button == QMessageBox.Yes:
                try:
                    os.startfile(mp3_path)
                except (OSError, AttributeError):
                    print("Unable to play file...")
            if mp3_wav == 1:
                self._convert_to_wav()
        else:
            self.show_error(
                'An error occurred during the conversion process...'
                '\nPossible reasons:'
                '\nThe file or directory cannot contain Chinese...'
                '\nThe network is unavailable...'
                '\nThe network cannot use a proxy... '
            )
        self.set_True_Btn()
        self.progressBar.setValue(0)

    def running(self):
        """Timer tick: advance the progress bar and refresh the status line."""
        global bar
        bar += 2
        total_time = time.time() - self.start_time
        self.progressBar.setValue(bar)
        if bar >= 100:
            bar = 0  # wrap around; total duration is not known in advance
        self.run_state.setText(self._stats_text(total_time))

    def helpWin(self):
        """Show the about/copyright dialog."""
        message = "\n\n\nThe copyright of this software belongs to: XXX Website: www.xxx.com \n\n\n"
        QMessageBox.information(self, my_title, message, QMessageBox.Ok)

    def quitWin(self):
        """Save settings, then exit if the user confirms."""
        r_button = QMessageBox.question(
            self,
            my_title,
            "\n\n\nExiting will terminate this program..."
            "\n\nAre you sure to exit? \n\n\n",
            QMessageBox.Yes | QMessageBox.No,
        )
        self.save_yaml()
        if r_button == QMessageBox.Yes:
            sys.exit()

    def outButton_fuc(self):
        """Let the user pick the output folder; keep the old one on cancel."""
        global out_dir
        chosen = QFileDialog.getExistingDirectory(
            self, 'Select the converted output folder', out_dir)
        if chosen:
            out_dir = chosen
        self.out_path.setText(out_dir)

    def open_fold_fuc(self):
        """Open the output folder with the system file manager."""
        try:
            os.startfile(out_dir)
        except (OSError, AttributeError):
            traceback.print_exc()

    def rates_slider_fuc(self):
        """Mirror the slider value into the label and the global rate string."""
        global rates
        value = self.rates_slider.value()
        self.audio_rates.setText(f'{value}%')
        # Always carry an explicit sign and no spaces: "+0%", "+15%", "-30%".
        rates = f'{value:+d}%'

    def click_audio_select(self, str1):
        """Update the global voice from the combo box selection."""
        global voices, voices_select
        voices_select = self.audio_select.currentIndex()
        voices = 'zh-CN-' + voices_list[voices_select]
        print('Selected voice:', voices)

    def click_try_lisson(self, str1):
        """Play a short sample with the current voice and rate."""
        self.try_lisson.setEnabled(False)
        text = "Thank you for choosing my voice"
        self.play_thread = PlayAudioWav(self, text)  # Start the Play thread
        self.play_thread.start()

    def click_checkBox_mp3(self):
        """Sync the global output format with the check boxes."""
        global mp3_wav
        # stateChanged fires on uncheck as well as check, so read the actual
        # state instead of assuming which box was just ticked.
        mp3_wav = 1 if self.checkBox_wav.isChecked() else 0

    def click_checkBox_wav(self):
        """Sync the global output format with the check boxes."""
        global mp3_wav
        mp3_wav = 1 if self.checkBox_wav.isChecked() else 0

    def click_textEdit(self):
        """Show the current text length in the status line."""
        txt_len = len(self.textEdit.toPlainText())
        self.run_state.setText(f"Statistics: Text length ({txt_len} words)")

    def save_yaml(self):
        """Persist the output folder, voice index and format to settings_file."""
        settings = {
            'out_dir': out_dir,
            'voices_select': voices_select,
            'mp3_wav': mp3_wav,
        }
        with open(settings_file, 'w', encoding='utf-8') as f:
            yaml.dump(settings, f)

    def createLayout(self):
        """Initialize widget state from the settings and connect the signals."""
        self.out_path.setText(out_dir)
        if mp3_wav == 0:
            self.checkBox_mp3.setChecked(True)
        else:
            self.checkBox_wav.setChecked(True)
        self.checkBox_mp3.stateChanged.connect(self.click_checkBox_mp3)
        self.checkBox_wav.stateChanged.connect(self.click_checkBox_wav)
        self.outButton.clicked.connect(self.outButton_fuc)
        self.chk_outputfile.clicked.connect(self.open_fold_fuc)
        self.try_lisson.clicked.connect(self.click_try_lisson)
        self.textEdit.textChanged.connect(self.click_textEdit)
        self.textEdit.setPlainText(
            "This software uses Microsoft Edge-TTS to quickly convert text into speech.")
        self.startButton.clicked.connect(self.start_run)
        self.helpButton.clicked.connect(self.helpWin)
        self.quitButton.clicked.connect(self.quitWin)
        self.rates_slider.setTickPosition(QSlider.TicksAbove)
        self.rates_slider.valueChanged.connect(self.rates_slider_fuc)
        self.audio_select.addItems([
            'Xiaoxiao:Female', 'Xiaoyi:Female', 'Yunxia:Female',
            'Northeast:Female', 'Shaanxi:Female',
            'Yunjian:Male', 'Yunxi:Male', 'Yunyang:Male',
        ])
        self.audio_select.setCurrentIndex(voices_select)
        self.audio_select.activated[str].connect(self.click_audio_select)


if __name__ == '__main__':
    QApplication.setAttribute(Qt.AA_EnableHighDpiScaling)
    app = QtWidgets.QApplication(sys.argv)
    winshot = Winshot()
    sys.exit(app.exec_())
The UI code is as follows:
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'txt2audio_UI.ui'
#
# Created by: PyQt5 UI code generator 5.15.2
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.

from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_txt2voice(object):
    """Widget layout of the text-to-speech form (fixed, absolute geometry)."""

    def setupUi(self, txt2voice):
        """Create every child widget on *txt2voice* and store it on ``self``."""

        def place(widget, name, x, y, w, h, styled=True):
            # Shared per-widget boilerplate: geometry, optional 9pt font,
            # and the Qt object name.
            widget.setGeometry(QtCore.QRect(x, y, w, h))
            if styled:
                face = QtGui.QFont()
                face.setFamily("宋体")
                face.setPointSize(9)
                widget.setFont(face)
            widget.setObjectName(name)
            return widget

        txt2voice.setObjectName("txt2voice")
        txt2voice.resize(435, 431)

        # Widgets are created in the same order as the generated form, since
        # creation order determines stacking and default tab order.
        self.startButton = place(
            QtWidgets.QPushButton(txt2voice), "startButton", 160, 371, 91, 23)
        self.helpButton = place(
            QtWidgets.QPushButton(txt2voice), "helpButton", 270, 371, 61, 23)
        self.quitButton = place(
            QtWidgets.QPushButton(txt2voice), "quitButton", 350, 371, 61, 23)
        self.textEdit = place(
            QtWidgets.QPlainTextEdit(txt2voice), "textEdit", 20, 30, 391, 175,
            styled=False)
        self.chk_outputfile = place(
            QtWidgets.QPushButton(txt2voice), "chk_outputfile", 20, 371, 61, 23)
        self.outButton = place(
            QtWidgets.QPushButton(txt2voice), "outButton", 20, 280, 61, 21)
        self.out_path = place(
            QtWidgets.QLabel(txt2voice), "out_path", 90, 280, 311, 20,
            styled=False)
        self.lbl_3 = place(
            QtWidgets.QLabel(txt2voice), "lbl_3", 26, 222, 51, 16)
        self.audio_select = place(
            QtWidgets.QComboBox(txt2voice), "audio_select", 86, 221, 71, 18)

        self.rates_slider = place(
            QtWidgets.QSlider(txt2voice), "rates_slider", 271, 219, 111, 20,
            styled=False)
        self.rates_slider.setMinimum(-99)
        self.rates_slider.setTracking(True)
        self.rates_slider.setOrientation(QtCore.Qt.Horizontal)
        self.rates_slider.setInvertedAppearance(False)
        self.rates_slider.setInvertedControls(False)

        self.aud = place(
            QtWidgets.QLabel(txt2voice), "aud", 211, 220, 61, 20)
        self.lbl_4 = place(
            QtWidgets.QLabel(txt2voice), "lbl_4", 27, 336, 61, 16)

        self.progressBar = place(
            QtWidgets.QProgressBar(txt2voice), "progressBar", 87, 340, 321, 8,
            styled=False)
        self.progressBar.setProperty("value", 0)
        self.progressBar.setTextVisible(False)
        self.progressBar.setInvertedAppearance(False)

        self.try_lisson = place(
            QtWidgets.QPushButton(txt2voice), "try_lisson", 160, 220, 31, 21)

        self.line = place(
            QtWidgets.QFrame(txt2voice), "line", 0, 401, 441, 16, styled=False)
        self.line.setFrameShape(QtWidgets.QFrame.HLine)
        self.line.setFrameShadow(QtWidgets.QFrame.Sunken)

        self.run_state = place(
            QtWidgets.QLabel(txt2voice), "run_state", 10, 410, 381, 20,
            styled=False)
        self.audio_file_path_txt_2 = place(
            QtWidgets.QLabel(txt2voice), "audio_file_path_txt_2", 23, 10, 241, 16)
        self.lbl_5 = place(
            QtWidgets.QLabel(txt2voice), "lbl_5", 26, 252, 61, 16)

        # The two format check boxes share one button group.
        self.checkBox_mp3 = place(
            QtWidgets.QCheckBox(txt2voice), "checkBox_mp3", 90, 252, 41, 16,
            styled=False)
        self.buttonGroup = QtWidgets.QButtonGroup(txt2voice)
        self.buttonGroup.setObjectName("buttonGroup")
        self.buttonGroup.addButton(self.checkBox_mp3)
        self.checkBox_wav = place(
            QtWidgets.QCheckBox(txt2voice), "checkBox_wav", 140, 252, 68, 16,
            styled=False)
        self.buttonGroup.addButton(self.checkBox_wav)

        self.line_2 = place(
            QtWidgets.QFrame(txt2voice), "line_2", 0, 310, 441, 16, styled=False)
        self.line_2.setFrameShape(QtWidgets.QFrame.HLine)
        self.line_2.setFrameShadow(QtWidgets.QFrame.Sunken)

        self.audio_rates = place(
            QtWidgets.QLabel(txt2voice), "audio_rates", 388, 220, 31, 20)

        self.retranslateUi(txt2voice)
        QtCore.QMetaObject.connectSlotsByName(txt2voice)

    def retranslateUi(self, txt2voice):
        """Assign the (translatable) caption of every widget."""
        _translate = QtCore.QCoreApplication.translate
        txt2voice.setWindowTitle(_translate("txt2voice", "AI"))

        # Buttons
        self.startButton.setText(_translate("txt2voice", "Start Translate"))
        self.helpButton.setText(_translate("txt2voice", "help"))
        self.quitButton.setText(_translate("txt2voice", "Quit"))
        self.chk_outputfile.setText(_translate("txt2voice", "View results"))
        self.outButton.setText(_translate("txt2voice", "Output directory"))

        # Labels and remaining controls
        self.out_path.setText(
            _translate("txt2voice", "Generated video output directory"))
        self.lbl_3.setText(_translate("txt2voice", "Voice selection:"))
        self.aud.setText(_translate("txt2voice", "Speech speed selection:"))
        self.lbl_4.setText(_translate("txt2voice", "Conversion progress:"))
        self.try_lisson.setText(_translate("txt2voice", "Audition"))
        self.run_state.setText(_translate("txt2voice", "Statistics:"))
        self.audio_file_path_txt_2.setText(
            _translate("txt2voice", "Please enter text:"))
        self.lbl_5.setText(_translate("txt2voice", "Output format:"))
        self.checkBox_mp3.setText(_translate("txt2voice", "MP3"))
        self.checkBox_wav.setText(_translate("txt2voice", "WAV"))
        self.audio_rates.setText(_translate("txt2voice", "0%"))