add ogg vorbis support, multiple output types, clean up, autodetect format from extension if not specified, add basic sample rate, bit depth and num channels support

main
vampi 2 months ago
parent 5e4ef9eb30
commit 3823cf0f66

@ -14,6 +14,9 @@
#include <io.h>
#include <fcntl.h>
#include <vorbis/vorbisenc.h>
#include <opus.h>
#include "getoptw.h"
const WCHAR *getErrorString(HRESULT r) {
@ -23,8 +26,8 @@ const WCHAR *getErrorString(HRESULT r) {
return L"Unknown";
}
void printJsonString(const WCHAR* in) {
if (!in) {
void printJsonString(const WCHAR *in) {
if(!in) {
fwprintf(stdout, L"null");
return;
}
@ -39,9 +42,9 @@ void printJsonString(const WCHAR* in) {
fputwc(in[i], stdout);
}
fputwc(L'"', stdout);
}
}
void printJsonKeyPair(const WCHAR* key, const WCHAR* value, int skipComma = 0) {
void printJsonKeyPair(const WCHAR *key, const WCHAR *value, int skipComma = 0) {
printJsonString(key);
wprintf(L": ");
printJsonString(value);
@ -78,18 +81,18 @@ int listVoices() {
return 1;
}
WCHAR* idString = 0L;
WCHAR *idString = 0L;
hr = cpVoiceToken->GetId(&idString);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not get token ID: %d %s\n", hr, getErrorString(hr));
return 1;
}
wprintf(L"{\n");
WCHAR* idBasename = 0L;
WCHAR *idBasename = 0L;
idBasename = wcsrchr(idString, '\\');
printJsonKeyPair(L"id", idBasename && idBasename[0] ? idBasename + 1 : idString);
WCHAR* descriptionString = 0L;
WCHAR *descriptionString = 0L;
hr = SpGetDescription(cpVoiceToken, &descriptionString);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not get token description: %d %s\n", hr, getErrorString(hr));
@ -104,26 +107,26 @@ int listVoices() {
return 1;
}
WCHAR* age;
WCHAR *age;
cpSpAttributesKey->GetStringValue(L"Age", &age);
printJsonKeyPair(L"age", age);
WCHAR* gender;
WCHAR *gender;
cpSpAttributesKey->GetStringValue(L"Gender", &gender);
printJsonKeyPair(L"gender", gender);
WCHAR* language;
WCHAR *language;
cpSpAttributesKey->GetStringValue(L"Language", &language);
WCHAR strNameBuffer[LOCALE_NAME_MAX_LENGTH] = {0};
WCHAR strNameBuffer[LOCALE_NAME_MAX_LENGTH] = { 0 };
int langId = wcstol(language, NULL, 16);
LCIDToLocaleName(langId, strNameBuffer, LOCALE_NAME_MAX_LENGTH, 0);
printJsonKeyPair(L"language", strNameBuffer);
WCHAR* name;
WCHAR *name;
cpSpAttributesKey->GetStringValue(L"Name", &name);
printJsonKeyPair(L"name", name);
WCHAR* vendor;
WCHAR *vendor;
cpSpAttributesKey->GetStringValue(L"Vendor", &vendor);
printJsonKeyPair(L"vendor", vendor, 1);
@ -158,8 +161,8 @@ int addLexemes() {
const struct {
LANGID langId;
const WCHAR* word;
const WCHAR* phone;
const WCHAR *word;
const WCHAR *phone;
} lexemes[] = {
{ langidUS, L"cum", L"k uw m" },
{ langidUS, L"poo", L"p uw" },
@ -196,59 +199,343 @@ int addLexemes() {
return 0;
}
class PooSpStream: public ISpStreamFormat, public ISpEventSink {
class BaseSpStream: public ISpStream, public ISpEventSink {
public:
CComPtr<ISpEventSink> sink;
LPCWSTR filename;
WAVEFORMATEX wfex;
const GUID *formatId;
ULONGLONG ullEventInterest;
HANDLE h;
PooSpStream() {
h = GetStdHandle(STD_OUTPUT_HANDLE);
}
BOOL isStdout;
BaseSpStream() {}
STDMETHODIMP QueryInterface(REFIID riid, void** ppv) {
if (ppv == NULL) return E_INVALIDARG;
STDMETHODIMP QueryInterface(REFIID riid, void **ppv) {
if(ppv == NULL) return E_INVALIDARG;
*ppv = NULL;
if (riid == IID_IUnknown || riid == IID_ISequentialStream || riid == IID_IStream || riid == IID_ISpStreamFormat)
*ppv = static_cast<ISpStreamFormat*>(this);
else if (riid == IID_ISpEventSink)
*ppv = static_cast<ISpEventSink*>(this);
if(riid == IID_IUnknown || riid == IID_ISequentialStream || riid == IID_IStream || riid == IID_ISpStreamFormat || riid == IID_ISpStream)
*ppv = static_cast<ISpStreamFormat *>(this);
else if(riid == IID_ISpEventSink)
*ppv = static_cast<ISpEventSink *>(this);
else return E_NOINTERFACE;
return S_OK;
}
STDMETHODIMP_(ULONG) AddRef(void) { return 1; }
STDMETHODIMP_(ULONG) Release(void) { return 1; }
HRESULT STDMETHODCALLTYPE Read(void*, ULONG, ULONG*) { wprintf(L"Read\n"); return 0; }
HRESULT STDMETHODCALLTYPE Write(const void* buf, ULONG size, ULONG* newPos) {
WriteFile(h, buf, size, newPos, NULL);
return S_OK;
}
HRESULT STDMETHODCALLTYPE Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER* plibNewPosition) {
STDMETHODIMP Read(void *, ULONG, ULONG *) { return 0; }
virtual STDMETHODIMP Write(const void *buf, ULONG size, ULONG *newPos) = 0;
STDMETHODIMP Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER *plibNewPosition) {
if(plibNewPosition)
plibNewPosition->QuadPart = dlibMove.QuadPart;
return S_OK;
}
HRESULT STDMETHODCALLTYPE SetSize(ULARGE_INTEGER) { return 0; }
HRESULT STDMETHODCALLTYPE CopyTo(IStream*, ULARGE_INTEGER, ULARGE_INTEGER*, ULARGE_INTEGER*) { return 0; }
HRESULT STDMETHODCALLTYPE Commit(DWORD) { return 0; }
HRESULT STDMETHODCALLTYPE Revert(void) { return 0; }
HRESULT STDMETHODCALLTYPE LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return 0; }
HRESULT STDMETHODCALLTYPE UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return 0; }
HRESULT STDMETHODCALLTYPE Stat(STATSTG*, DWORD) { return 0; }
HRESULT STDMETHODCALLTYPE Clone(IStream**) { return 0; }
HRESULT STDMETHODCALLTYPE GetFormat(GUID* pguidFormatId, WAVEFORMATEX**format) {
return SpConvertStreamFormatEnum(SPSF_16kHz16BitMono, pguidFormatId, format);
}
HRESULT STDMETHODCALLTYPE AddEvents(const SPEVENT* pEventArray, ULONG ulCount) {
STDMETHODIMP SetSize(ULARGE_INTEGER) { return 0; }
STDMETHODIMP CopyTo(IStream *, ULARGE_INTEGER, ULARGE_INTEGER *, ULARGE_INTEGER *) { return 0; }
STDMETHODIMP Commit(DWORD) { return 0; }
STDMETHODIMP Revert(void) { return 0; }
STDMETHODIMP LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return 0; }
STDMETHODIMP UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return 0; }
STDMETHODIMP Stat(STATSTG *, DWORD) { return 0; }
STDMETHODIMP Clone(IStream **) { return 0; }
// Reports the format this stream was bound with.
//
// pguidFormatId  receives a copy of the GUID stored by BindToFile
// format         receives a CoTaskMemAlloc'ed copy of wfex
//
// Returns S_OK on success, E_POINTER for a null out-parameter, E_UNEXPECTED
// when no format GUID has been stored, E_OUTOFMEMORY when the copy cannot be
// allocated.
STDMETHODIMP GetFormat(GUID *pguidFormatId, WAVEFORMATEX **format) {
    if(!pguidFormatId || !format) return E_POINTER;
    *format = NULL;
    if(!formatId) return E_UNEXPECTED;

    WAVEFORMATEX *pwfex = (WAVEFORMATEX *)::CoTaskMemAlloc(sizeof(WAVEFORMATEX));
    if(!pwfex) return E_OUTOFMEMORY;
    CopyMemory(pwfex, &wfex, sizeof(WAVEFORMATEX));

    *pguidFormatId = *formatId;
    *format = pwfex;
    return S_OK;
}
virtual STDMETHODIMP writeEventData(void *buf, ULONG sz) = 0;
// Display names used by writeJsonEvent, which indexes this table with the
// event's eEventId (ids of 15 and above are mapped to slot 0 there).
// Only the first ten of the fifteen slots are given a name; the remaining
// entries are value-initialised, i.e. null pointers.
// NOTE(review): the order presumably mirrors SAPI's SPEVENTENUM values 0-9 — confirm.
const WCHAR *event_names[15] = {
L"undefined",
L"startInputStream",
L"endInputStream",
L"voiceChange",
L"ttsBookmark",
L"wordBoundary",
L"phoneme",
L"sentenceBoundary",
L"viseme",
L"ttsAudioLevel",
};
// Formats one event as a single-line JSON object
// ({"type":"<name>","timeMs":<n>}) and hands the UTF-16 text to
// writeEventData.
//
// Returns E_POINTER for a null event, otherwise whatever writeEventData
// returns.
STDMETHODIMP writeJsonEvent(const SPEVENT *ev) {
    if(!ev) return E_POINTER;

    // The stream offset is converted with the byte rate of the bound format
    // rather than a fixed 16000, so the time stays right for any sample
    // rate / bit depth / channel count.
    // NOTE(review): SAPI documents ullAudioStreamOffset as a byte offset — confirm.
    ULONGLONG timeMs = 0;
    if(wfex.nAvgBytesPerSec != 0)
        timeMs = ev->ullAudioStreamOffset * 1000 / wfex.nAvgBytesPerSec;

    // Slots 10..14 of event_names are null; fall back to "undefined" for
    // those as well as for out-of-range ids.
    const WCHAR *name = ev->eEventId < 15 ? event_names[ev->eEventId] : 0;
    if(!name) name = event_names[0];

    WCHAR buf[MAX_PATH];
    _snwprintf_s(buf, sizeof(buf) / sizeof(buf[0]), _TRUNCATE, L"{\"type\":\"%s\",\"timeMs\":%llu}\n", name, timeMs);
    return writeEventData(buf, (ULONG)(wcslen(buf) * sizeof(buf[0])));
}
// FIXME: optimize by not allocating every time
// Serializes one event as an SPSERIALIZEDEVENT record and hands the bytes to
// writeEventData.
//
// Returns E_POINTER for a null event, otherwise the result of writeEventData
// so that a failing sink is visible to the caller.
STDMETHODIMP writeSpEvent(const SPEVENT *ev) {
    if(!ev) return E_POINTER;

    CSpEvent cspev;
    cspev.CopyFrom(ev);

    ULONG sz = cspev.SerializeSize<SPSERIALIZEDEVENT>();
    BYTE *buf = new BYTE[sz];
    cspev.Serialize<SPSERIALIZEDEVENT>((SPSERIALIZEDEVENT *)buf);

    HRESULT hr = writeEventData(buf, sz);
    delete[] buf;
    return hr;
}
// Event sink entry point: forwards each of the ulCount events in
// pEventArray, in order, to writeSpEvent. The per-event result is not
// inspected; the call always reports S_OK.
STDMETHODIMP AddEvents(const SPEVENT *pEventArray, ULONG ulCount) {
    const SPEVENT *cur = pEventArray;
    ULONG remaining = ulCount;
    while(remaining != 0) {
        writeSpEvent(cur);
        ++cur;
        --remaining;
    }
    return S_OK;
}
// Reports the event mask stored in ullEventInterest.
// Returns E_POINTER when the out-parameter is null, S_OK otherwise.
STDMETHODIMP GetEventInterest(ULONGLONG *pullEventInterest) {
    if(!pullEventInterest) return E_POINTER;
    *pullEventInterest = ullEventInterest;
    return S_OK;
}
// Not supported by this stream: the arguments are ignored and only a trace
// line is emitted. The trace goes to stderr because stdout may be carrying
// the audio data (output file name "-").
STDMETHODIMP SetBaseStream(IStream *pStream, REFGUID rguidFormat, const WAVEFORMATEX *pWaveFormatEx) {
    fwprintf(stderr, L"SetBaseStream\n");
    return S_OK;
}
// This stream wraps no base IStream. The out-parameter is always cleared so
// a caller never reads an indeterminate pointer, and the call reports
// E_NOTIMPL (E_POINTER when ppStream itself is null).
// The trace goes to stderr because stdout may be carrying the audio data.
STDMETHODIMP GetBaseStream(IStream **ppStream) {
    fwprintf(stderr, L"GetBaseStream\n");
    if(!ppStream) return E_POINTER;
    *ppStream = NULL;
    return E_NOTIMPL;
}
// Binds the stream to its output and remembers the requested format.
//
// filename_          path of the file to create, or "-" to write to stdout
// eMode              not used by this implementation
// pFormatId          format GUID later reported by GetFormat (the pointer is
//                    stored, not the GUID it points to)
// pWaveFormatEx      wave format, copied into wfex
// ullEventInterest_  event mask later reported by GetEventInterest
//
// Returns S_OK on success, E_INVALIDARG when a required argument is null, or
// the CreateFileW error converted with HRESULT_FROM_WIN32.
virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
    if(!filename_ || !pFormatId || !pWaveFormatEx) {
        fwprintf(stderr, L"BindToFile: output name and format are required\n");
        return E_INVALIDARG;
    }

    // Diagnostics go to stderr: with "-" as output, stdout carries the audio.
    fwprintf(stderr, L"BindToFile filename_=\"%s\" ullEventInterest=0x%04llx\n", filename_, ullEventInterest_);

    filename = filename_;
    ullEventInterest = ullEventInterest_;
    formatId = pFormatId;
    CopyMemory(&wfex, pWaveFormatEx, sizeof(WAVEFORMATEX));

    isStdout = filename[0] == '-' && filename[1] == 0;
    if(isStdout) {
        h = GetStdHandle(STD_OUTPUT_HANDLE);
        return S_OK;
    }

    h = CreateFileW(filename, GENERIC_WRITE, FILE_SHARE_READ, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN, 0);
    if(h == INVALID_HANDLE_VALUE) {
        DWORD e = GetLastError();
        h = NULL; // keep Close() from trying to close an invalid handle
        WCHAR buf[MAX_PATH];
        buf[0] = 0; // stays empty if FormatMessage cannot produce a text
        FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, 0, e, 0, buf, sizeof(buf) / sizeof(buf[0]), 0);
        fwprintf(stderr, L"Could not open \"%s\" for writing: %d %s\n", filename_, e, buf);
        return HRESULT_FROM_WIN32(e);
    }
    return S_OK;
}
// Closes the output file. The stdout handle is never closed, and a null or
// invalid handle is treated as "nothing to close". The handle is cleared
// afterwards so a second Close() is harmless.
//
// Returns S_OK, or the CloseHandle error converted with HRESULT_FROM_WIN32.
virtual STDMETHODIMP Close(void) {
    // Diagnostics go to stderr: with "-" as output, stdout carries the audio.
    fwprintf(stderr, L"Close\n");
    if(isStdout || !h || h == INVALID_HANDLE_VALUE)
        return S_OK;

    BOOL b = CloseHandle(h);
    DWORD e = b ? 0 : GetLastError();
    h = NULL;
    if(!b) {
        WCHAR buf[MAX_PATH];
        buf[0] = 0; // stays empty if FormatMessage cannot produce a text
        FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, 0, e, 0, buf, sizeof(buf) / sizeof(buf[0]), 0);
        fwprintf(stderr, L"Could not close \"%s\": %d (%s)\n", filename, e, buf);
        return HRESULT_FROM_WIN32(e);
    }
    return S_OK;
}
};
class RawSpStream: public BaseSpStream {
public:
HANDLE eh; // events file handle
RawSpStream() {}
virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
HRESULT hr = BaseSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
if(FAILED(hr)) return hr;
if(isStdout) {
eh = (HANDLE)_get_osfhandle(3);
} else if(ullEventInterest) {
fwprintf(stderr, L"Cannot select events when output is not stdout\n");
return E_INVALIDARG;
}
return S_OK;
}
HRESULT STDMETHODCALLTYPE Write(const void *buf, ULONG size, ULONG *newPos) {
WriteFile(h, buf, size, newPos, 0);
return S_OK;
}
// FIXME: error checking
STDMETHODIMP writeEventData(void *buf, ULONG sz) {
if(!eh) return E_FAIL;
WriteFile(eh, buf, sz, 0, 0);
return S_OK;
}
};
class OggSpStream: public BaseSpStream {
public:
// Logical Ogg stream carrying the Vorbis audio (initialised with serial 1 in BindToFile).
ogg_stream_state ogg_voice_st;
// Logical Ogg stream reserved for events (initialised with serial 2 in BindToFile).
ogg_stream_state ogg_events_st;
// Scratch page reused by every flush/pageout loop in this class.
ogg_page og;
// Scratch packet filled by vorbis_bitrate_flushpacket in Write.
ogg_packet op;
// Both counters are reset to 0 in BindToFile; the visible code does not read them afterwards.
ULONG granulepos;
ULONG packetno;
// libvorbis encoder state: set up in BindToFile, cleared in Close.
vorbis_info vi;
vorbis_comment vc;
vorbis_dsp_state vd;
vorbis_block vb;
// Leaves all members uninitialised; BindToFile performs the actual setup.
OggSpStream() {}
// Binds the output via the base class, sets up the Vorbis encoder for the
// bound wave format and writes the stream headers.
//
// Page order written to the output:
//   1. Vorbis identification header (first page of the audio stream)
//   2. "SAPIEVNT" first page of the events stream, only when events are selected
//   3. Vorbis comment and codebook headers
//
// Returns the base class failure, E_INVALIDARG when the encoder rejects the
// channel count / sample rate, the WriteFile error when a header page cannot
// be written, or S_OK.
virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
    HRESULT hr = BaseSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
    if(FAILED(hr)) return hr;

    vorbis_info_init(&vi);
    if(vorbis_encode_init_vbr(&vi, wfex.nChannels, wfex.nSamplesPerSec, 0.1f) != 0) {
        fwprintf(stderr, L"Could not initialize Vorbis encoder for %u channel(s) at %lu Hz\n", wfex.nChannels, wfex.nSamplesPerSec);
        vorbis_info_clear(&vi);
        BaseSpStream::Close();
        return E_INVALIDARG;
    }
    vorbis_comment_init(&vc);
    vorbis_comment_add_tag(&vc, "ENCODER", "sapicli");
    vorbis_analysis_init(&vd, &vi);
    vorbis_block_init(&vd, &vb);
    ogg_stream_init(&ogg_voice_st, 1);
    ogg_stream_init(&ogg_events_st, 2);
    granulepos = 0;
    packetno = 0;

    // Forces out every pending page of the given stream and writes it.
    // WriteFile requires a byte-count pointer when no OVERLAPPED is given.
    auto flushPages = [this](ogg_stream_state *st) -> HRESULT {
        while(ogg_stream_flush(st, &og)) {
            DWORD written = 0;
            if(!WriteFile(h, og.header, og.header_len, &written, 0) ||
               !WriteFile(h, og.body, og.body_len, &written, 0))
                return HRESULT_FROM_WIN32(GetLastError());
        }
        return S_OK;
    };

    ogg_packet header;
    ogg_packet header_comm;
    ogg_packet header_code;
    vorbis_analysis_headerout(&vd, &vc, &header, &header_comm, &header_code);

    ogg_stream_packetin(&ogg_voice_st, &header); /* automatically placed in its own page */
    hr = flushPages(&ogg_voice_st);
    if(FAILED(hr)) return hr;

    if(ullEventInterest != 0) {
        unsigned char evntHead[8] = { 'S', 'A', 'P', 'I', 'E', 'V', 'N', 'T' };
        ogg_packet p;
        memset(&p, 0, sizeof(p));
        p.packet = evntHead;
        p.bytes = 8;
        p.b_o_s = 1;
        ogg_stream_packetin(&ogg_events_st, &p);
        hr = flushPages(&ogg_events_st);
        if(FAILED(hr)) return hr;
    }

    ogg_stream_packetin(&ogg_voice_st, &header_comm);
    ogg_stream_packetin(&ogg_voice_st, &header_code);
    return flushPages(&ogg_voice_st);
}
// Encodes a block of interleaved PCM and writes every finished Ogg page.
//
// buf     interleaved PCM in the bound format: 16-bit signed or 8-bit
//         unsigned samples, wfex.nChannels per frame
// size    number of bytes in buf; 0 signals end of input to the encoder
// newPos  optional; receives the number of bytes consumed
//
// Returns S_OK, E_POINTER for a null buffer with a non-zero size,
// E_INVALIDARG for an unsupported bit depth / channel count, or the WriteFile
// error converted with HRESULT_FROM_WIN32.
HRESULT STDMETHODCALLTYPE Write(const void *buf, ULONG size, ULONG *newPos) {
    if(newPos) *newPos = 0;

    if(size == 0) {
        vorbis_analysis_wrote(&vd, 0);
    } else {
        if(!buf) return E_POINTER;
        const int channels = wfex.nChannels;
        const int bits = wfex.wBitsPerSample;
        if(channels <= 0 || (bits != 8 && bits != 16)) return E_INVALIDARG;

        const int frames = (int)(size / (ULONG)(channels * (bits / 8)));
        // A block shorter than one frame must not reach
        // vorbis_analysis_wrote(&vd, 0): that call marks end of input.
        if(frames > 0) {
            float **buffer = vorbis_analysis_buffer(&vd, frames);
            if(bits == 16) {
                const short *pcm = (const short *)buf;
                for(int j = 0; j < channels; j++) {
                    float *dst = buffer[j];
                    // De-interleave: channel j of frame i sits at i * channels + j.
                    for(int i = 0; i < frames; i++)
                        dst[i] = pcm[i * channels + j] / 32768.f;
                }
            } else {
                // 8-bit PCM is unsigned with 128 as the zero level.
                const unsigned char *pcm = (const unsigned char *)buf;
                for(int j = 0; j < channels; j++) {
                    float *dst = buffer[j];
                    for(int i = 0; i < frames; i++)
                        dst[i] = (pcm[i * channels + j] - 128) / 128.f;
                }
            }
            vorbis_analysis_wrote(&vd, frames);
        }
    }

    HRESULT result = S_OK;
    int eos = 0;
    while(vorbis_analysis_blockout(&vd, &vb) == 1) {
        vorbis_analysis(&vb, NULL);
        vorbis_bitrate_addblock(&vb);
        while(vorbis_bitrate_flushpacket(&vd, &op)) {
            ogg_stream_packetin(&ogg_voice_st, &op);
            while(!eos) {
                if(ogg_stream_pageout(&ogg_voice_st, &og) == 0) break;
                // WriteFile requires a byte-count pointer when no OVERLAPPED is given.
                DWORD written = 0;
                if(!WriteFile(h, og.header, og.header_len, &written, 0) ||
                   !WriteFile(h, og.body, og.body_len, &written, 0)) {
                    // Remember the first failure but keep draining the encoder.
                    if(SUCCEEDED(result)) result = HRESULT_FROM_WIN32(GetLastError());
                }
                if(ogg_page_eos(&og)) eos = 1;
            }
        }
    }
    if(FAILED(result)) return result;

    // newPos is an out-parameter: assign the byte count instead of adding to
    // whatever value the caller's variable happened to hold.
    if(newPos) *newPos = size;
    return S_OK;
}
HRESULT STDMETHODCALLTYPE GetEventInterest(ULONGLONG* pullEventInterest) {
*pullEventInterest = 0xFFFFFFFF;
// Finishes the Ogg output: signals end of input to the encoder through a
// zero-length Write, terminates the events stream when events are selected,
// releases all libogg/libvorbis state and closes the output via the base
// class.
//
// Returns the first failure encountered (final audio write, events page
// write, or base Close), otherwise S_OK. Cleanup always runs.
HRESULT STDMETHODCALLTYPE Close() {
    HRESULT result = Write(0, 0, 0);

    if(ullEventInterest != 0) {
        // Empty packet whose only purpose is to carry the end-of-stream flag.
        ogg_packet p;
        memset(&p, 0, sizeof(p));
        p.e_o_s = 1;
        ogg_stream_packetin(&ogg_events_st, &p);
        while(ogg_stream_pageout(&ogg_events_st, &og)) {
            // WriteFile requires a byte-count pointer when no OVERLAPPED is given.
            DWORD written = 0;
            if(!WriteFile(h, og.header, og.header_len, &written, 0) ||
               !WriteFile(h, og.body, og.body_len, &written, 0)) {
                if(SUCCEEDED(result)) result = HRESULT_FROM_WIN32(GetLastError());
            }
        }
    }

    ogg_stream_clear(&ogg_voice_st);
    ogg_stream_clear(&ogg_events_st);
    vorbis_block_clear(&vb);
    vorbis_dsp_clear(&vd);
    vorbis_comment_clear(&vc);
    vorbis_info_clear(&vi);

    HRESULT closeResult = BaseSpStream::Close();
    return FAILED(result) ? result : closeResult;
}
// FIXME: error checking
// Sink for serialized event data (buf, sz bytes). Event output into the Ogg
// file is currently switched off: the function reports S_OK without writing
// anything.
STDMETHODIMP writeEventData(void *buf, ULONG sz) {
// Early return: every statement below is unreachable.
return S_OK;
// NOTE(review): before re-enabling the code below, check that
//  - p is fully initialised (only packet and bytes are set here),
//  - the packet is submitted to the stream that is paged out: it is handed to
//    ogg_voice_st while pages are pulled from ogg_events_st,
//  - the WriteFile calls pass a byte-count pointer (both trailing arguments are 0).
ogg_packet p;
p.packet = (unsigned char *)buf;
p.bytes = sz;
ogg_stream_packetin(&ogg_voice_st, &p);
while(1) {
int result = ogg_stream_pageout(&ogg_events_st, &og);
if(result == 0) break;
WriteFile(h, og.header, og.header_len, 0, 0);
WriteFile(h, og.body, og.body_len, 0, 0);
}
return S_OK;
}
};
int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, int rate, int volume, DWORD speakFlags, ULONGLONG ullEventInterest) {
int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, DWORD outType, int rate, int volume, DWORD speakFlags, DWORD samplesPerSec, WORD bitsPerSample, WORD nChannels, ULONGLONG ullEventInterest) {
HRESULT hr;
// detect output type by file extension
if(outType == 0) {
outType = 1;
if(wavFilename && wavFilename[0]) {
size_t s = wcslen(wavFilename);
if(s >= 4) {
if(!_wcsicmp(wavFilename + s - 4, L".ogg"))
outType = 3;
else if(!_wcsicmp(wavFilename + s - 4, L".wav"))
outType = 2;
}
}
}
if(addLexemes())
return 1;
@ -259,27 +546,22 @@ int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, int rate, int vo
return 1;
}
WCHAR fullVoiceId[MAX_PATH];
_snwprintf_s(fullVoiceId, MAX_PATH, _TRUNCATE, L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\%s", voiceId);
CComPtr<ISpObjectToken> voiceToken;
hr = SpGetTokenFromId(fullVoiceId, &voiceToken);
if (FAILED(hr)) {
fwprintf(stderr, L"Could not get token for voice \"%s\": %d %s\n", voiceId, hr, getErrorString(hr));
return 1;
}
if(voiceId && voiceId[0]) {
WCHAR fullVoiceId[MAX_PATH];
_snwprintf_s(fullVoiceId, MAX_PATH, _TRUNCATE, L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\%s", voiceId);
hr = voice->SetVoice(voiceToken);
if (FAILED(hr)) {
fwprintf(stderr, L"Could not set voice: %d %s\n", hr, getErrorString(hr));
return 1;
}
hr = SpGetTokenFromId(fullVoiceId, &voiceToken);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not get token for voice \"%s\": %d %s\n", voiceId, hr, getErrorString(hr));
return 1;
}
PooSpStream poo;
hr = voice->SetOutput(static_cast<IUnknown*>(static_cast<ISpStreamFormat*>(&poo)), FALSE);
if (FAILED(hr)) {
fwprintf(stderr, L"Could not set output: %d %s\n", hr, getErrorString(hr));
return 1;
hr = voice->SetVoice(voiceToken);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not set voice: %d %s\n", hr, getErrorString(hr));
return 1;
}
}
hr = voice->SetRate(rate);
@ -294,40 +576,79 @@ int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, int rate, int vo
return 1;
}
ISpStream *outputStream = 0;
if(outType == 2) {
HRESULT hr = ::CoCreateInstance(CLSID_SpStream, NULL, CLSCTX_ALL, __uuidof(outputStream), (void **)&outputStream);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not instantiate SpStream: %d %s\n", hr, getErrorString(hr));
}
} else if(outType == 3) {
outputStream = new OggSpStream();
} else if(outType == 1) {
outputStream = new RawSpStream();
} else {
fwprintf(stderr, L"Invalid output type %d\n", outType);
return E_INVALIDARG;
}
if(!outputStream) {
fwprintf(stderr, L"Could not initialize output stream\n");
return E_FAIL;
}
WAVEFORMATEX wfex;
wfex.wFormatTag = WAVE_FORMAT_PCM;
wfex.nChannels = nChannels;
wfex.nSamplesPerSec = samplesPerSec;
wfex.wBitsPerSample = bitsPerSample;
wfex.nBlockAlign = wfex.nChannels * wfex.wBitsPerSample / 8;
wfex.nAvgBytesPerSec = wfex.nSamplesPerSec * wfex.nBlockAlign;
wfex.cbSize = 0;
hr = outputStream->BindToFile(wavFilename, SPFM_CREATE_ALWAYS, &SPDFID_WaveFormatEx, &wfex, ullEventInterest);
if(FAILED(hr)) {
outputStream->Release();
return 1;
}
hr = voice->SetOutput(outputStream, FALSE);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not set output: %d %s\n", hr, getErrorString(hr));
return 1;
}
hr = voice->Speak(text, speakFlags, 0);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not speak: %d %s\n", hr, getErrorString(hr));
return 1;
}
voiceToken.Release();
outputStream->Close();
if(voiceId && voiceId[0])
voiceToken.Release();
return 0;
}
int wmain(int argc, WCHAR* argv[]) {
int wmain(int argc, WCHAR *argv[]) {
// https://stackoverflow.com/questions/2492077/output-unicode-strings-in-windows-console-app
(void)_setmode(_fileno(stdout), _O_U8TEXT);
const struct option long_options[] = {
{L"help", no_argument, 0, L'h'},
{L"list", no_argument, 0, L'l'},
{L"output", required_argument, 0, L'o'},
{L"voice", required_argument, 0, L'v'},
{L"type", required_argument, 0, L't'},
{L"rate", required_argument, 0, L'r'},
{L"volume", required_argument, 0, L'V'},
{L"all-events", no_argument, 0, L'a'},
{L"start-input-stream-event", no_argument, 0, L'S'},
{L"end-input-stream-event", no_argument, 0, L'E'},
{L"voice-change-event", no_argument, 0, L'C'},
{L"bookmark-event", no_argument, 0, L'B'},
{L"word-boundary-event", no_argument, 0, L'W'},
{L"phoneme-event", no_argument, 0, L'F'},
{L"sentence-boundary-event", no_argument, 0, L'N'},
{L"viseme-event", no_argument, 0, L'I'},
{L"audio-level-event", no_argument, 0, L'L'},
{0, 0, 0, 0},
{ L"help", no_argument, 0, L'h' },
{ L"list", no_argument, 0, L'l' },
{ L"output", required_argument, 0, L'o' },
{ L"out-type", required_argument, 0, L'T' },
{ L"voice", required_argument, 0, L'v' },
{ L"type", required_argument, 0, L't' },
{ L"rate", required_argument, 0, L'r' },
{ L"volume", required_argument, 0, L'V' },
{ L"sample-rate", required_argument, 0, L's' },
{ L"bits", required_argument, 0, L'b' },
{ L"channels", required_argument, 0, L'c' },
{ L"events", required_argument, 0, L'e' },
{ 0, 0, 0, 0 },
};
int help = 0;
@ -337,16 +658,19 @@ int wmain(int argc, WCHAR* argv[]) {
DWORD speakFlags = 0;
int rate = 0;
int volume = 100;
DWORD samplesPerSec = 22050;
WORD bitsPerSample = 16, nChannels = 1;
ULONGLONG ullEventInterest = 0;
DWORD outType = 0;
int option;
int option_index = 0;
while(1) {
option = getoptW_long(argc, argv, L"hlo:v:t:r:VaSECBWFNIL", long_options, &option_index);
option = getoptW_long(argc, argv, L"hlo:T:v:t:r:Vs:b:c:e:", long_options, &option_index);
if(option == -1) break;
switch (option) {
switch(option) {
case L'h':
help = 1;
break;
@ -356,6 +680,18 @@ int wmain(int argc, WCHAR* argv[]) {
case L'o':
wavFilename = optarg;
break;
case L'T':
if(!_wcsicmp(optarg, L"auto"))
outType = 0;
else if(!_wcsicmp(optarg, L"raw"))
outType = 1;
else if(!_wcsicmp(optarg, L"wav"))
outType = 2;
else if(!_wcsicmp(optarg, L"ogg"))
outType = 3;
else
help = 1;
break;
case L'v':
voice = optarg;
break;
@ -377,35 +713,20 @@ int wmain(int argc, WCHAR* argv[]) {
case L'V':
volume = wcstol(optarg, 0, 10);
break;
case L'a':
ullEventInterest = SPFEI_ALL_EVENTS;
case L's':
samplesPerSec = wcstol(optarg, 0, 10);
break;
case L'S':
ullEventInterest |= SPEI_START_INPUT_STREAM;
case L'b':
bitsPerSample = (WORD)wcstol(optarg, 0, 10);
break;
case L'E':
ullEventInterest |= SPEI_END_INPUT_STREAM;
case L'c':
nChannels = (WORD)wcstol(optarg, 0, 10);
break;
case L'C':
ullEventInterest |= SPEI_VOICE_CHANGE;
break;
case L'B':
ullEventInterest |= SPEI_TTS_BOOKMARK;
break;
case L'W':
ullEventInterest |= SPEI_WORD_BOUNDARY;
break;
case L'F':
ullEventInterest |= SPEI_PHONEME;
break;
case L'N':
ullEventInterest |= SPEI_SENTENCE_BOUNDARY;
break;
case L'I':
ullEventInterest |= SPEI_VISEME;
break;
case L'L':
ullEventInterest |= SPEI_TTS_AUDIO_LEVEL;
case L'e':
if(!_wcsicmp(optarg, L"all"))
ullEventInterest = 0xfffe;
else
ullEventInterest = wcstol(optarg, 0, 0);
break;
}
}
@ -419,21 +740,34 @@ int wmain(int argc, WCHAR* argv[]) {
L"Usage: %s --list | [options] <text>\n"
L" -h, --help Print this help.\n"
L" -l, --list List all voices.\n"
L" -o, --output=FILE Output WAV file.\n"
L" -o, --output=FILE Output file. Default is `output.wav`\n"
L" Use `-' for stdout.\n"
L" -T, --out-type=TYPE Output file type. Default is `auto'\n"
L" `wav' for RIFF .wav\n"
L" `ogg' for Ogg Vorbis\n"
L" `raw' for raw PCM samples\n"
L" `auto' to autodetect from file extension\n"
L" -v, --voice=VOICE Select voice.\n"
L" -t, --type=TYPE Input text type (PLAIN,SSML,SAPI,AUTO).\n"
L" -r, --rate=RATE Rate (speed) of speech, from -10 to 10.\n"
L" -V, --volume=VOL Volume of speech, from 0 to 100.\n"
L" -a, --all-events Log all events in the EVNT RIFF chunk.\n"
L" -S, --start-input-stream-event Log start input stream events.\n"
L" -E, --end-input-stream-event Log end input stream events.\n"
L" -C, --voice-change-event Log voice change events.\n"
L" -B, --bookmark-event Log bookmark events.\n"
L" -W, --word-boundary-event Log word boundary events.\n"
L" -F, --phoneme-event Log phoneme events.\n"
L" -N, --sentence-boundary-event Log sentence boundary events.\n"
L" -I, --viseme-event Log viseme events.\n"
L" -L, --audio-level-event Log audio level events.\n",
L" By default, events are logged into the\n"
L" output stream if it is a .wav or an .ogg\n"
L" -s, --sample-rate=HZ Sample rate of output. Default 22050.\n"
L" -b, --bits=BITS Bit depth of output. Default 16.\n"
L" -c, --channels=CHANNELS Number of audio channels of output. Default 1.\n"
L" -e, --events=MASK Select events that are output.\n"
L" Possible values:\n"
L" Stream start 2\n"
L" Stream end 4\n"
L" Voice change 8\n"
L" Bookmark 16\n"
L" Word boundary 32\n"
L" Phoneme 64\n"
L" Sentence boundary 128\n"
L" Viseme 256\n"
L" Audio level 512\n"
L" All TTS events 65534 or `all'\n",
argv[0]
);
return 1;
@ -449,7 +783,7 @@ int wmain(int argc, WCHAR* argv[]) {
if(list) {
ret = listVoices();
} else {
ret = speakToWav(argv[optind], voice, wavFilename, rate, volume, speakFlags, ullEventInterest);
ret = speakToWav(argv[optind], voice, wavFilename, outType, rate, volume, speakFlags, samplesPerSec, bitsPerSample, nChannels, ullEventInterest);
}
::CoUninitialize();

Loading…
Cancel
Save