You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
sapicli/sapicli.cpp

1319 lines
40 KiB

#include <windows.h> // System includes
#include <atlbase.h> // ATL
#include <atlcom.h>
#include <windowsx.h>
#include <wchar.h>
#include <tchar.h>
#include <sapi.h> // SAPI includes
#pragma warning(push) // Disable warning C4996: 'GetVersionExA': was declared deprecated (sphelper.h:1319)
#pragma warning(disable: 4996)
#include <sphelper.h>
#include <spddkhlp.h>
#pragma warning(pop)
#include <initguid.h>
#include <io.h>
#include <fcntl.h>
#include <vorbis/vorbisenc.h>
#include <opus.h>
#include <lame.h>
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#include "getoptw.h"
// Maps an HRESULT to a human-readable wide string for error messages.
// The body of the switch is supplied entirely by "sapierr.h", which is not
// visible here (presumably a list of `case X: return L"...";` lines — verify
// against that header). Codes the included cases do not return for fall out
// of the switch and yield L"Unknown".
const WCHAR *getErrorString(HRESULT r) {
switch(r) {
#include "sapierr.h"
}
return L"Unknown";
}
void printJsonString(const WCHAR *in) {
if(!in) {
fwprintf(stdout, L"null");
return;
}
size_t l = wcslen(in);
fputwc(L'"', stdout);
for(size_t i = 0; i < l; i++) {
if(in[i] == L'"')
fputwc(L'\\', stdout);
else if(in[i] == L'\\')
fputwc(L'\\', stdout);
fputwc(in[i], stdout);
}
fputwc(L'"', stdout);
}
/**
 * Prints one `"key": value` line of a JSON object to stdout.
 *
 * Both strings go through printJsonString. The line ends with a comma plus
 * newline unless `skipComma` is non-zero, in which case only the newline is
 * written (used for the last member of an object).
 */
void printJsonKeyPair(const WCHAR *key, const WCHAR *value, int skipComma = 0) {
    printJsonString(key);
    wprintf(L": ");
    printJsonString(value);
    wprintf(skipComma ? L"\n" : L",\n");
}
int listVoices() {
HRESULT hr = 0L;
CComPtr<IEnumSpObjectTokens> voicesEnum;
hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &voicesEnum);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not enumerate tokens: %d %s\n", hr, getErrorString(hr));
return 1;
}
ULONG ulCount = 0;
hr = voicesEnum->GetCount(&ulCount);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not get token count: %d %s\n", hr, getErrorString(hr));
return 1;
}
wprintf(L"[\n");
while(ulCount--) {
CComPtr<ISpObjectToken> cpVoiceToken;
hr = voicesEnum->Next(1, &cpVoiceToken, NULL);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not iterate voice token: %d %s\n", hr, getErrorString(hr));
return 1;
}
WCHAR *idString = 0L;
hr = cpVoiceToken->GetId(&idString);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not get token ID: %d %s\n", hr, getErrorString(hr));
return 1;
}
wprintf(L"{\n");
WCHAR *idBasename = 0L;
idBasename = wcsrchr(idString, '\\');
printJsonKeyPair(L"id", idBasename && idBasename[0] ? idBasename + 1 : idString);
WCHAR *descriptionString = 0L;
hr = SpGetDescription(cpVoiceToken, &descriptionString);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not get token description: %d %s\n", hr, getErrorString(hr));
return 1;
}
printJsonKeyPair(L"description", descriptionString);
CComPtr<ISpDataKey> cpSpAttributesKey;
hr = cpVoiceToken->OpenKey(L"Attributes", &cpSpAttributesKey);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not open attributes key: %d %s\n", hr, getErrorString(hr));
return 1;
}
WCHAR *age;
cpSpAttributesKey->GetStringValue(L"Age", &age);
printJsonKeyPair(L"age", age);
WCHAR *gender;
cpSpAttributesKey->GetStringValue(L"Gender", &gender);
printJsonKeyPair(L"gender", gender);
WCHAR *language;
cpSpAttributesKey->GetStringValue(L"Language", &language);
WCHAR strNameBuffer[LOCALE_NAME_MAX_LENGTH] = { 0 };
int langId = wcstol(language, NULL, 16);
LCIDToLocaleName(langId, strNameBuffer, LOCALE_NAME_MAX_LENGTH, 0);
printJsonKeyPair(L"language", strNameBuffer);
WCHAR *name;
cpSpAttributesKey->GetStringValue(L"Name", &name);
printJsonKeyPair(L"name", name);
WCHAR *vendor;
cpSpAttributesKey->GetStringValue(L"Vendor", &vendor);
printJsonKeyPair(L"vendor", vendor, 1);
if(ulCount > 0)
wprintf(L"},\n");
else
wprintf(L"}\n");
}
wprintf(L"]\n");
return 0;
}
int addLexemes() {
HRESULT hr;
CComPtr<ISpLexicon> cpLexicon;
hr = cpLexicon.CoCreateInstance(CLSID_SpLexicon);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not instantiate lexicon: %d %s\n", hr, getErrorString(hr));
return 1;
}
LANGID langidUS = MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US);
CComPtr<ISpPhoneConverter> cpPhoneConv;
hr = SpCreatePhoneConverter(langidUS, NULL, NULL, &cpPhoneConv);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not instantiate phoneme converter: %d %s\n", hr, getErrorString(hr));
return 1;
}
const struct {
LANGID langId;
const WCHAR *word;
const WCHAR *phone;
} lexemes[] = {
{ langidUS, L"cum", L"k uw m" },
{ langidUS, L"poo", L"p uw" },
{ langidUS, L"lol", L"l uh l" },
{ langidUS, L"lolol", L"l uh l uh l" },
{ langidUS, L"deez", L"d iy z" },
{ langidUS, L"nutz", L"n ah t s" },
{ langidUS, L"nasim", L"n ah s iy m" },
};
for(int i = 0; i < sizeof(lexemes) / sizeof(lexemes[0]); i++) {
SPPHONEID wszId[SP_MAX_PRON_LENGTH];
hr = cpPhoneConv->PhoneToId(lexemes[i].phone, wszId);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not convert phoneme \"%s\" to id: ", lexemes[i].phone);
if(hr == E_INVALIDARG)
fwprintf(stderr, L"Invalid argument");
else if(hr == SPERR_UNINITIALIZED)
fwprintf(stderr, L"Uninitialized");
else if(hr == E_FAIL)
fwprintf(stderr, L"Failed");
else fwprintf(stderr, L"%d", hr);
fwprintf(stderr, L"\n");
continue;
}
hr = cpLexicon->AddPronunciation(lexemes[i].word, lexemes[i].langId, SPPS_Noun, wszId);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not add pronounciation for word \"%s\": %d %s\n", lexemes[i].word, hr, getErrorString(hr));
continue;
}
}
return 0;
}
class BaseSpStream: public ISpStream, public ISpEventSink {
public:
LPCWSTR filename;
WAVEFORMATEX wfex;
const GUID *formatId;
ULONGLONG ullEventInterest;
HANDLE h;
BOOL isStdout;
BaseSpStream() {}
STDMETHODIMP QueryInterface(REFIID riid, void **ppv) {
if(ppv == NULL) return E_INVALIDARG;
*ppv = NULL;
if(riid == IID_IUnknown || riid == IID_ISequentialStream || riid == IID_IStream || riid == IID_ISpStreamFormat || riid == IID_ISpStream)
*ppv = static_cast<ISpStreamFormat *>(this);
else if(riid == IID_ISpEventSink)
*ppv = static_cast<ISpEventSink *>(this);
else return E_NOINTERFACE;
return S_OK;
}
STDMETHODIMP_(ULONG) AddRef(void) { return 1; }
STDMETHODIMP_(ULONG) Release(void) { return 1; }
STDMETHODIMP Read(void *, ULONG, ULONG *) { return 0; }
virtual STDMETHODIMP Write(const void *buf, ULONG size, ULONG *newPos) = 0;
STDMETHODIMP Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER *plibNewPosition) {
if(plibNewPosition)
plibNewPosition->QuadPart = dlibMove.QuadPart;
return S_OK;
}
STDMETHODIMP SetSize(ULARGE_INTEGER) { return 0; }
STDMETHODIMP CopyTo(IStream *, ULARGE_INTEGER, ULARGE_INTEGER *, ULARGE_INTEGER *) { return 0; }
STDMETHODIMP Commit(DWORD) { return 0; }
STDMETHODIMP Revert(void) { return 0; }
STDMETHODIMP LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return 0; }
STDMETHODIMP UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return 0; }
STDMETHODIMP Stat(STATSTG *, DWORD) { return 0; }
STDMETHODIMP Clone(IStream **) { return 0; }
STDMETHODIMP GetFormat(GUID *pguidFormatId, WAVEFORMATEX **format) {
*pguidFormatId = *formatId;
WAVEFORMATEX *pwfex = (WAVEFORMATEX *)::CoTaskMemAlloc(sizeof(WAVEFORMATEX));
CopyMemory(pwfex, &wfex, sizeof(WAVEFORMATEX));
*format = pwfex;
return S_OK;
}
virtual STDMETHODIMP writeEventData(void *buf, size_t sz) = 0;
// FIXME: optimize by not allocating every time
STDMETHODIMP writeSpEvent(const SPEVENT *ev) {
CSpEvent cspev;
cspev.CopyFrom(ev);
ULONG sz = cspev.SerializeSize<SPSERIALIZEDEVENT>();
BYTE *buf = new BYTE[sz];
cspev.Serialize<SPSERIALIZEDEVENT>((SPSERIALIZEDEVENT *)buf);
writeEventData(buf, sz);
delete[] buf;
return S_OK;
}
STDMETHODIMP AddEvents(const SPEVENT *pEventArray, ULONG ulCount) {
for(ULONG i = 0; i < ulCount; i++) {
const SPEVENT *ev = &pEventArray[i];
writeSpEvent(ev);
}
return S_OK;
}
STDMETHODIMP GetEventInterest(ULONGLONG *pullEventInterest) {
*pullEventInterest = ullEventInterest;
return S_OK;
}
STDMETHODIMP SetBaseStream(IStream *pStream, REFGUID rguidFormat, const WAVEFORMATEX *pWaveFormatEx) { return S_OK; }
STDMETHODIMP GetBaseStream(IStream **ppStream) { return S_OK; }
virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
if(SP_IS_BAD_STRING_PTR(filename_) || eMode >= SPFM_NUM_MODES || SP_IS_BAD_OPTIONAL_READ_PTR(pFormatId))
return E_INVALIDARG;
filename = filename_;
ullEventInterest = ullEventInterest_;
formatId = pFormatId;
CopyMemory(&wfex, pWaveFormatEx, sizeof(WAVEFORMATEX));
isStdout = filename_ && filename_[0] == '-' && filename_[1] == 0;
if(isStdout) {
h = GetStdHandle(STD_OUTPUT_HANDLE);
} else {
h = CreateFileW(filename_, GENERIC_WRITE, FILE_SHARE_READ, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN, 0);
if(h == INVALID_HANDLE_VALUE) {
DWORD e = GetLastError();
WCHAR buf[MAX_PATH];
FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, buf, sizeof(buf) / sizeof(buf[0]), 0);
fwprintf(stderr, L"Could not open \"%s\" for writing: %d %s\n", filename_, e, buf);
return HRESULT_FROM_WIN32(e);
}
}
return S_OK;
}
virtual STDMETHODIMP Close(void) {
if(isStdout || !h) return S_OK;
BOOL b = CloseHandle(h);
if(b) return S_OK;
DWORD e = GetLastError();
WCHAR buf[MAX_PATH];
FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, buf, sizeof(buf) / sizeof(buf[0]), 0);
fwprintf(stderr, L"Could not close \"%s\": %d (%s)", filename, e, buf);
return HRESULT_FROM_WIN32(e);
}
};
/**
 * Output stream that writes the audio bytes unchanged to the bound file or
 * stdout. Serialized events, when requested, go to file descriptor 3, which
 * is only supported together with stdout output.
 */
class RawSpStream: public BaseSpStream {
public:
    HANDLE eh; // events file handle; NULL when events are not being written
    RawSpStream(): eh(0) {}

    virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
        isStdout = filename_ && filename_[0] == '-' && filename_[1] == 0;
        if(ullEventInterest_) {
            if(!isStdout) {
                fwprintf(stderr, L"Cannot select events (0x%04llx) when output is not stdout\n", ullEventInterest_);
                return E_INVALIDARG;
            }
            // _get_osfhandle reports failure as -1 (INVALID_HANDLE_VALUE) or -2,
            // neither of which is a usable handle.
            const intptr_t osHandle = _get_osfhandle(3);
            if(osHandle == -1 || osHandle == -2) {
                fwprintf(stderr, L"File descriptor 3 is not open; events will not be written\n");
                eh = 0;
            } else {
                eh = (HANDLE)osHandle;
            }
        }
        return BaseSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
    }

    // Writes `size` bytes of audio; `newPos`, when given, receives the number
    // of bytes actually written.
    HRESULT STDMETHODCALLTYPE Write(const void *buf, ULONG size, ULONG *newPos) {
        // WriteFile requires a byte-count pointer when no OVERLAPPED is passed,
        // so a local is used even if the caller gave NULL.
        DWORD written = 0;
        BOOL r = WriteFile(h, buf, size, &written, 0);
        if(newPos) *newPos = written;
        if(r) return S_OK;
        DWORD e = GetLastError();
        WCHAR errbuf[MAX_PATH];
        errbuf[0] = 0;
        FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, errbuf, sizeof(errbuf) / sizeof(errbuf[0]), 0);
        fwprintf(stderr, L"Could not write audio samples to %s: %d (%s)", isStdout ? L"stdout" : filename, e, errbuf);
        return HRESULT_FROM_WIN32(e);
    }

    // Writes one serialized event to the events handle; E_FAIL when there is none.
    STDMETHODIMP writeEventData(void *buf, size_t sz) {
        if(!eh) return E_FAIL;
        DWORD written = 0;
        BOOL b = WriteFile(eh, buf, (ULONG)sz, &written, 0);
        if(b) return S_OK;
        DWORD e = GetLastError();
        WCHAR errbuf[MAX_PATH];
        errbuf[0] = 0;
        FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, errbuf, sizeof(errbuf) / sizeof(errbuf[0]), 0);
        fwprintf(stderr, L"Could not write event data: %d (%s)", e, errbuf);
        return HRESULT_FROM_WIN32(e);
    }
};
class OggSpStream: public BaseSpStream {
public:
ogg_stream_state ogg_voice_st;
ogg_stream_state ogg_events_st;
ogg_int64_t granulepos;
ogg_int64_t packetNo, eventpacketNo;
OggSpStream(): granulepos(0), packetNo(0), eventpacketNo(0) {}
virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
if(ogg_stream_init(&ogg_voice_st, 1)) {
fwprintf(stderr, L"Could not initialize ogg stream\n");
return E_FAIL;
}
if(ogg_stream_init(&ogg_events_st, 2)) {
fwprintf(stderr, L"Could not initialize ogg stream\n");
return E_FAIL;
}
granulepos = packetNo = eventpacketNo = 0;
HRESULT hr = BaseSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
if(FAILED(hr)) {
fwprintf(stderr, L"Could not bind to file %s: %d %s\n", filename_, hr, getErrorString(hr));
return hr;
}
return S_OK;
}
STDMETHODIMP writeEventHead() {
if(ullEventInterest == 0) return S_OK;
unsigned char evntHead[8] = { 'S', 'A', 'P', 'I', 'E', 'V', 'N', 'T' };
ogg_packet p;
p.packet = evntHead;
p.bytes = 8;
p.b_o_s = 1;
p.e_o_s = 0;
p.granulepos = 0;
p.packetno = eventpacketNo++;
if(ogg_stream_packetin(&ogg_events_st, &p)) {
fwprintf(stderr, L"Could not add the header packet to the events stream\n");
return E_FAIL;
}
return flushStream(&ogg_events_st);
}
STDMETHODIMP writeEventData(void *buf, size_t sz) {
if(ullEventInterest == 0) return S_OK;
ogg_packet p;
p.packet = (unsigned char *)buf;
p.bytes = (ULONG)sz;
p.e_o_s = 0;
p.b_o_s = 0;
p.granulepos = granulepos;
p.packetno = eventpacketNo++;
if(ogg_stream_packetin(&ogg_events_st, &p)) {
fwprintf(stderr, L"Could not add an event data packet of length %lu to the events stream\n", (ULONG)sz);
return E_FAIL;
}
return pageoutStream(&ogg_events_st);
}
STDMETHODIMP flushEventStream(void) {
if(ullEventInterest == 0) return S_OK;
ogg_packet p;
p.packet = 0;
p.bytes = 0;
p.b_o_s = 0;
p.e_o_s = 1;
p.granulepos = granulepos;
p.packetno = eventpacketNo++;
if(ogg_stream_packetin(&ogg_events_st, &p)) {
fwprintf(stderr, L"Could not add the final packet to the events stream\n");
return E_FAIL;
}
return flushStream(&ogg_events_st);
}
virtual STDMETHODIMP Write(const void *buf, ULONG size, ULONG *newPos) = 0;
HRESULT STDMETHODCALLTYPE Close() {
flushEventStream();
if(ogg_stream_clear(&ogg_voice_st)) {
fwprintf(stderr, L"Could not clear voice stream\n");
return E_FAIL;
}
if(ogg_stream_clear(&ogg_events_st)) {
fwprintf(stderr, L"Could not clear events stream\n");
return E_FAIL;
}
return BaseSpStream::Close();
}
STDMETHODIMP writePage(ogg_page *p) {
BOOL r = WriteFile(h, p->header, p->header_len, 0, 0);
if(r) r = WriteFile(h, p->body, p->body_len, 0, 0);
if(r) return S_OK;
DWORD e = GetLastError();
WCHAR buf[MAX_PATH];
FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, buf, sizeof(buf) / sizeof(buf[0]), 0);
fwprintf(stderr, L"Could not write to %s: %d (%s)", isStdout ? L"stdout" : filename, e, buf);
return HRESULT_FROM_WIN32(e);
}
STDMETHODIMP pageoutStream(ogg_stream_state *os) {
ogg_page p;
while(ogg_stream_pageout(os, &p)) {
HRESULT hr = writePage(&p);
if(hr != S_OK) {
fwprintf(stderr, L"Could not write page: %d %s\n", hr, getErrorString(hr));
return hr;
}
}
return S_OK;
}
STDMETHODIMP flushStream(ogg_stream_state *os) {
ogg_page p;
while(ogg_stream_flush(os, &p)) {
HRESULT hr = writePage(&p);
if(hr != S_OK) {
fwprintf(stderr, L"Could not write page: %d %s\n", hr, getErrorString(hr));
return hr;
}
}
return S_OK;
}
};
/**
 * Ogg output stream that encodes the PCM audio with libvorbis (VBR, quality
 * 0.1). BindToFile() sets up the encoder and writes the three Vorbis header
 * packets; Write() converts PCM to float, runs the analysis and pages out the
 * resulting packets; Close() signals end of input and releases the encoder.
 */
class OggVorbisSpStream: public OggSpStream {
public:
    vorbis_info vi;
    vorbis_comment vc;
    vorbis_dsp_state vd;
    vorbis_block vb;
    OggVorbisSpStream() {}

    const WCHAR *getVorbisErrorString(int r) {
        switch(r) {
        case OV_EFAULT: return L"Internal logic fault; indicates a bug or heap / stack corruption.";
        case OV_EINVAL: return L"Invalid setup request, eg, out of range argument.";
        case OV_EIMPL: return L"Unimplemented mode; unable to comply with quality level request.";
        }
        return L"Unknown error";
    }
    HRESULT vorbisToHresult(int r) {
        switch(r) {
        case OV_EFAULT: return E_FAIL;
        case OV_EINVAL: return E_INVALIDARG;
        case OV_EIMPL: return E_NOTIMPL;
        }
        return E_FAIL;
    }

    STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
        if(!pWaveFormatEx) return E_INVALIDARG;
        vorbis_info_init(&vi);
        int r = vorbis_encode_init_vbr(&vi, pWaveFormatEx->nChannels, pWaveFormatEx->nSamplesPerSec, 0.1f);
        if(r) {
            fwprintf(stderr, L"Could not initialize vorbis encoder: %d %s\n", r, getVorbisErrorString(r));
            return vorbisToHresult(r);
        }
        vorbis_comment_init(&vc);
        vorbis_comment_add_tag(&vc, "ENCODER", "sapicli");
        if(vorbis_analysis_init(&vd, &vi)) {
            fwprintf(stderr, L"Could not initialize vorbis encoder's analysis state\n");
            return E_FAIL;
        }
        if(vorbis_block_init(&vd, &vb)) {
            fwprintf(stderr, L"Could not initialize vorbis_block structure\n");
            return E_FAIL;
        }
        HRESULT hr = OggSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
        if(FAILED(hr)) return hr;
        ogg_packet header;
        ogg_packet header_comm;
        ogg_packet header_code;
        r = vorbis_analysis_headerout(&vd, &vc, &header, &header_comm, &header_code);
        if(r) {
            fwprintf(stderr, L"Could not initialize vorbis encoder: %d %s\n", r, getVorbisErrorString(r));
            return vorbisToHresult(r);
        }
        /* automatically placed in its own page */
        if(ogg_stream_packetin(&ogg_voice_st, &header)) {
            fwprintf(stderr, L"Could not add vorbis header packet to voice stream\n");
            return E_FAIL;
        }
        hr = flushStream(&ogg_voice_st);
        if(hr != S_OK) return hr;
        // The events stream's first page is written right after the voice
        // stream's first page, before the remaining Vorbis headers.
        hr = writeEventHead();
        if(hr != S_OK) return hr;
        if(ogg_stream_packetin(&ogg_voice_st, &header_comm)) {
            fwprintf(stderr, L"Could not add vorbis comment header to voice stream\n");
            return E_FAIL;
        }
        if(ogg_stream_packetin(&ogg_voice_st, &header_code)) {
            fwprintf(stderr, L"Could not add vorbis code header to voice stream\n");
            return E_FAIL;
        }
        return flushStream(&ogg_voice_st);
    }

    /**
     * Encodes `size` bytes of interleaved PCM. A call with size 0 tells the
     * encoder that the input has ended (used by Close()).
     * `newPos`, when given, receives the number of input bytes consumed.
     */
    HRESULT STDMETHODCALLTYPE Write(const void *buf, ULONG size, ULONG *newPos) {
        int r;
        if(size == 0) {
            r = vorbis_analysis_wrote(&vd, 0);
            if(r) {
                fwprintf(stderr, L"Could not set wrote 0 samples on vorbis analyzer: %d %s\n", r, getVorbisErrorString(r));
                return vorbisToHresult(r);
            }
        } else {
            if(!buf || wfex.wBitsPerSample == 0 || wfex.nChannels == 0)
                return E_INVALIDARG; // would otherwise divide by zero below
            // 64-bit intermediate so that size * 8 cannot wrap.
            const int nSamples = (int)((ULONGLONG)size * 8 / wfex.wBitsPerSample / wfex.nChannels);
            granulepos += nSamples;
            float **buffer = vorbis_analysis_buffer(&vd, nSamples);
            /* Optimized copy for common combination of bit depths and numbers of channels */
            if(wfex.wBitsPerSample == 8 && wfex.nChannels == 1) {
                const unsigned char *srcSample = (const unsigned char *)buf;
                float *sample0 = buffer[0];
                for(int i = 0; i < nSamples; i++) {
                    *(sample0++) = (*(srcSample++) - 128.f) / 128.f;
                }
            } else if(wfex.wBitsPerSample == 8 && wfex.nChannels == 2) {
                const unsigned char *srcSample = (const unsigned char *)buf;
                float *sample0 = buffer[0];
                float *sample1 = buffer[1];
                for(int i = 0; i < nSamples; i++) {
                    *(sample0++) = (*(srcSample++) - 128.f) / 128.f;
                    *(sample1++) = (*(srcSample++) - 128.f) / 128.f;
                }
            } else if(wfex.wBitsPerSample == 16 && wfex.nChannels == 1) {
                const short *srcSample = (const short *)buf;
                float *sample0 = buffer[0];
                for(int i = 0; i < nSamples; i++) {
                    *(sample0++) = *(srcSample++) / 32768.f;
                }
            } else if(wfex.wBitsPerSample == 16 && wfex.nChannels == 2) {
                const short *srcSample = (const short *)buf;
                float *sample0 = buffer[0];
                float *sample1 = buffer[1];
                for(int i = 0; i < nSamples; i++) {
                    *(sample0++) = *(srcSample++) / 32768.f;
                    *(sample1++) = *(srcSample++) / 32768.f;
                }
            } else {
                /* Generic, rarely used, slow method */
                // NOTE(review): samples are treated as signed here, including
                // 8-bit ones — confirm this is intended for >2 channel 8-bit input.
                const int bytesPerSample = (wfex.wBitsPerSample + 7) >> 3;
                if(bytesPerSample > (int)sizeof(LONGLONG))
                    return E_INVALIDARG; // would overflow the accumulator below
                // 2^(bits-1), built by multiplication so wide depths cannot overflow an int shift.
                float divisor = 1.f;
                for(int b = 1; b < wfex.wBitsPerSample; b++)
                    divisor *= 2.f;
                const char *startSrcSample = (const char *)buf;
                const int strideSkip = bytesPerSample * wfex.nChannels;
                for(int j = 0; j < wfex.nChannels; j++) {
                    float *sample = buffer[j];
                    const char *srcSample = startSrcSample;
                    for(int i = 0; i < nSamples; i++) {
                        // Pre-fill with the sign, then overwrite the low bytes
                        // (relies on little-endian layout).
                        LONGLONG srcSampleAccum = (srcSample[bytesPerSample - 1] < 0) ? -1 : 0;
                        memcpy((void *)&srcSampleAccum, srcSample, bytesPerSample);
                        srcSample += strideSkip;
                        *(sample++) = (float)srcSampleAccum / divisor;
                    }
                    startSrcSample += bytesPerSample;
                }
            }
            r = vorbis_analysis_wrote(&vd, nSamples);
            if(r) {
                fwprintf(stderr, L"Could not set wrote %d samples on vorbis analyzer: %d %s\n", nSamples, r, getVorbisErrorString(r));
                return vorbisToHresult(r);
            }
        }
        int eos = 0;
        while(vorbis_analysis_blockout(&vd, &vb) == 1) {
            r = vorbis_analysis(&vb, NULL);
            if(r) {
                fwprintf(stderr, L"Could not run vorbis analysis: %d %s\n", r, getVorbisErrorString(r));
                return vorbisToHresult(r);
            }
            r = vorbis_bitrate_addblock(&vb);
            if(r) {
                fwprintf(stderr, L"Could not submit block to vorbis bitrate management engine: %d %s\n", r, getVorbisErrorString(r));
                return vorbisToHresult(r);
            }
            ogg_packet packet;
            while(vorbis_bitrate_flushpacket(&vd, &packet)) {
                if(ogg_stream_packetin(&ogg_voice_st, &packet)) {
                    fwprintf(stderr, L"Could not add vorbis packet to voice stream\n");
                    return E_FAIL;
                }
                while(!eos) {
                    ogg_page page;
                    int result = ogg_stream_pageout(&ogg_voice_st, &page);
                    if(result == 0) break;
                    HRESULT hr = writePage(&page);
                    if(FAILED(hr)) return hr;
                    if(ogg_page_eos(&page)) eos = 1;
                }
            }
        }
        // Out-parameter: report the bytes consumed by this call.
        if(newPos) *newPos = size;
        return S_OK;
    }

    // Signals end of input, releases the encoder state and closes the
    // container. Cleanup always runs; the first failure is returned.
    HRESULT STDMETHODCALLTYPE Close() {
        HRESULT hr = Write(0, 0, 0);
        vorbis_block_clear(&vb);
        vorbis_dsp_clear(&vd);
        vorbis_comment_clear(&vc);
        vorbis_info_clear(&vi);
        HRESULT closeHr = OggSpStream::Close();
        return FAILED(hr) ? hr : closeHr;
    }
};
/**
 * Ogg output stream that encodes the PCM audio with Opus (VOIP application,
 * voice signal, 10 kbit/s, 20 ms frames). Input samples are collected in
 * `frame` until a full frame is available, then encoded and added to the
 * voice stream. Only 8/16-bit input with 1 or 2 channels is accepted.
 */
class OggOpusSpStream: public OggSpStream {
public:
    OpusEncoder *enc;           // NULL while no encoder exists
    opus_int16 frame[2880 * 2]; // max frame size times two channels
    int framepos;               // samples (per channel) currently buffered in `frame`
    int framesize;              // samples (per channel) per encoded frame
    OggOpusSpStream(): enc(0), framepos(0), framesize(960) {}

    const WCHAR *getOpusErrorString(int err) {
        switch(err) {
        case OPUS_OK: return L"No error";
        case OPUS_BAD_ARG: return L"One or more invalid / out of range arguments.";
        case OPUS_BUFFER_TOO_SMALL: return L"Not enough bytes allocated in the buffer.";
        case OPUS_INTERNAL_ERROR: return L"An internal error was detected.";
        case OPUS_INVALID_PACKET: return L"The compressed data passed is corrupted.";
        case OPUS_UNIMPLEMENTED: return L"Invalid / unsupported request number.";
        case OPUS_INVALID_STATE: return L"An encoder or decoder structure is invalid or already freed.";
        case OPUS_ALLOC_FAIL: return L"Memory allocation has failed.";
        }
        return L"Unknown error";
    }
    HRESULT opusToHresult(int err) {
        switch(err) {
        case OPUS_OK: return S_OK;
        case OPUS_BAD_ARG: return E_INVALIDARG;
        case OPUS_BUFFER_TOO_SMALL: return E_NOT_SUFFICIENT_BUFFER;
        case OPUS_INTERNAL_ERROR: return E_FAIL;
        case OPUS_INVALID_PACKET: return E_INVALID_PROTOCOL_FORMAT;
        case OPUS_UNIMPLEMENTED: return E_NOTIMPL;
        case OPUS_INVALID_STATE: return E_FAIL;
        case OPUS_ALLOC_FAIL: return E_FAIL;
        }
        return E_FAIL;
    }

    // Creates the encoder, opens the output and writes the OpusHead and
    // OpusTags packets (with the events stream header in between).
    STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
        int err;
        if(!pWaveFormatEx) return E_INVALIDARG;
        if(pWaveFormatEx->wBitsPerSample != 16 && pWaveFormatEx->wBitsPerSample != 8) {
            fwprintf(stderr, L"Only 8 and 16 bit depth is supported for opus\n");
            return E_INVALIDARG;
        }
        if(pWaveFormatEx->nChannels != 1 && pWaveFormatEx->nChannels != 2) {
            fwprintf(stderr, L"Only 1 or 2 channels are supported for opus\n");
            return E_INVALIDARG;
        }
        enc = opus_encoder_create(pWaveFormatEx->nSamplesPerSec, pWaveFormatEx->nChannels, OPUS_APPLICATION_VOIP, &err);
        if(err != OPUS_OK) {
            fwprintf(stderr, L"Error creating encoder: %d %s\n", err, getOpusErrorString(err));
            enc = 0;
            return opusToHresult(err);
        }
        // open file only after some sanity checks above
        HRESULT hr = OggSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
        if(FAILED(hr)) {
            // Nothing was opened: do not keep the encoder around.
            opus_encoder_destroy(enc);
            enc = 0;
            return hr;
        }
        opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
        opus_encoder_ctl(enc, OPUS_SET_BITRATE(10000));
        framesize = pWaveFormatEx->nSamplesPerSec * 20 / 1000;
        framepos = 0;
        ogg_packet header;
        int lookahead = 3840;
        opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&lookahead));
        // The encoder reports lookahead at its own sample rate; the OpusHead
        // pre-skip field is expressed in 48 kHz samples.
        const unsigned preSkip = (unsigned)((ULONGLONG)lookahead * 48000 / pWaveFormatEx->nSamplesPerSec);
        unsigned char opusHeader[19] = {
            'O', 'p', 'u', 's',
            'H', 'e', 'a', 'd',
            1,                                                      // version
            (unsigned char)pWaveFormatEx->nChannels,
            (unsigned char)(preSkip >> 0),                          // pre-skip, little endian
            (unsigned char)(preSkip >> 8),
            (unsigned char)(pWaveFormatEx->nSamplesPerSec >> 0),    // input sample rate, little endian
            (unsigned char)(pWaveFormatEx->nSamplesPerSec >> 8),
            (unsigned char)(pWaveFormatEx->nSamplesPerSec >> 16),
            (unsigned char)(pWaveFormatEx->nSamplesPerSec >> 24),
            0x00, 0x00,                                             // output gain
            0                                                       // channel mapping family
        };
        header.packet = opusHeader;
        header.bytes = sizeof(opusHeader);
        header.b_o_s = 1;
        header.e_o_s = 0;
        header.granulepos = 0;
        header.packetno = packetNo++;
        if(ogg_stream_packetin(&ogg_voice_st, &header)) {
            fwprintf(stderr, L"Could not add OpusHead packet to voice stream\n");
            return E_FAIL;
        }
        hr = flushStream(&ogg_voice_st);
        if(FAILED(hr)) return hr;
        hr = writeEventHead();
        if(FAILED(hr)) return hr;
        unsigned char opusTags[42] = {
            'O', 'p', 'u', 's',
            'T', 'a', 'g', 's',
            7, 0, 0, 0,                              // vendor string length
            's', 'a', 'p', 'i', 'c', 'l', 'i',
            1, 0, 0, 0,                              // number of comments
            15, 0, 0, 0,                             // length of the comment below
            'E', 'N', 'C', 'O', 'D', 'E', 'R', '=',
            's', 'a', 'p', 'i', 'c', 'l', 'i'
        };
        header.packet = opusTags;
        header.bytes = sizeof(opusTags);
        header.b_o_s = 0;
        header.e_o_s = 0;
        header.granulepos = 0;
        header.packetno = packetNo++;
        if(ogg_stream_packetin(&ogg_voice_st, &header)) {
            fwprintf(stderr, L"Could not add OpusTags packet to voice stream\n");
            return E_FAIL;
        }
        return flushStream(&ogg_voice_st);
    }

    /**
     * Buffers `size` bytes of interleaved PCM and encodes each completed
     * frame. `newPos`, when given, receives the number of input bytes consumed.
     */
    STDMETHODIMP Write(const void *buf, ULONG size, ULONG *newPos) {
        // Validate before dividing by the format fields.
        if((wfex.wBitsPerSample != 16 && wfex.wBitsPerSample != 8) || (wfex.nChannels != 1 && wfex.nChannels != 2))
            return E_INVALIDARG;
        if(!enc) return SPERR_UNINITIALIZED;
        const int nSamples = (int)((ULONGLONG)size * 8 / wfex.wBitsPerSample / wfex.nChannels);
        unsigned char encbuf[4096];
        const short *samples16 = (const short *)buf;
        const unsigned char *samples8 = (const unsigned char *)buf;
        opus_int16 *frameptr = &frame[framepos * wfex.nChannels];
        for(int x = 0; x < nSamples; x++) {
            for(int ch = 0; ch < wfex.nChannels; ch++) {
                if(wfex.wBitsPerSample == 16) {
                    *(frameptr++) = *(samples16++);
                } else {
                    // Unsigned 8-bit -> signed 16-bit: re-centre into the high
                    // byte and repeat the source byte in the low byte. The byte
                    // is read once so the pointer increment is well defined.
                    const int s = *(samples8++);
                    *(frameptr++) = (opus_int16)((s - 128) * 256 + s);
                }
            }
            granulepos++;
            framepos++;
            if(framepos < framesize)
                continue;
            framepos = 0;
            frameptr = frame;
            int encoded = opus_encode(enc, frame, framesize, encbuf, sizeof(encbuf));
            if(encoded < 0) {
                fwprintf(stderr, L"Could not encode %d samples of opus data %d %s\n", framesize, encoded, getOpusErrorString(encoded));
                return opusToHresult(encoded);
            }
            // Results of 2 bytes or fewer are skipped rather than written.
            if(encoded <= 2)
                continue;
            ogg_packet p;
            p.packet = encbuf;
            p.bytes = encoded;
            p.b_o_s = p.e_o_s = 0;
            p.granulepos = granulepos * 48000 / wfex.nSamplesPerSec;
            p.packetno = packetNo++;
            if(ogg_stream_packetin(&ogg_voice_st, &p)) {
                fwprintf(stderr, L"Could not write opus voice packet of length %d to ogg stream\n", (int)p.bytes);
                return E_FAIL;
            }
            HRESULT hr;
            // Force a page out every 50 frames; otherwise let libogg decide.
            if(granulepos % (framesize * 50))
                hr = pageoutStream(&ogg_voice_st);
            else
                hr = flushStream(&ogg_voice_st);
            if(hr != S_OK) return hr;
        }
        if(newPos) *newPos = size;
        return S_OK;
    }

    // Pads the partially filled frame with silence, writes it as the
    // end-of-stream packet, destroys the encoder and closes the container.
    // The encoder and file are released even when encoding or writing fails;
    // the first failure is returned.
    STDMETHODIMP Close() {
        if(!enc) return S_OK; // never bound, or already closed
        HRESULT hr = S_OK;
        unsigned char encbuf[4096];
        // Sizes are in bytes: the frame holds 16-bit samples.
        memset(frame + framepos * wfex.nChannels, 0, (size_t)(framesize - framepos) * wfex.nChannels * sizeof(opus_int16));
        int encoded = opus_encode(enc, frame, framesize, encbuf, sizeof(encbuf));
        if(encoded < 0) {
            fwprintf(stderr, L"Could not encode final %d (%d) samples of opus data %d %s\n", framesize, framepos, encoded, getOpusErrorString(encoded));
            hr = E_FAIL;
        } else {
            ogg_packet p;
            p.packet = encbuf;
            p.bytes = encoded > 2 ? encoded : 0;
            p.b_o_s = 0;
            p.e_o_s = 1;
            granulepos += framepos;
            p.granulepos = granulepos * 48000 / wfex.nSamplesPerSec;
            p.packetno = packetNo++;
            if(ogg_stream_packetin(&ogg_voice_st, &p)) {
                fwprintf(stderr, L"Could not add final packet to voice stream\n");
                hr = E_FAIL;
            } else {
                hr = flushStream(&ogg_voice_st);
            }
        }
        opus_encoder_destroy(enc);
        enc = 0;
        framepos = 0;
        HRESULT closeHr = OggSpStream::Close();
        return hr != S_OK ? hr : closeHr;
    }
};
/**
 * Output stream that encodes 16-bit PCM to MP3 with LAME (128 kbit/s,
 * quality 5, no VBR tag) and writes the result to the bound file or stdout.
 * Event output uses the events handle inherited from RawSpStream.
 */
class Mp3SpStream: public RawSpStream {
public:
    lame_global_flags *gfp;        // NULL unless BindToFile succeeded and Close has not run
    unsigned char encodebuf[16384];
    Mp3SpStream(): gfp(0) {}

    // Applies all encoder settings to `gfp`; prints the reason and returns a
    // failure code on the first setting that LAME rejects.
    HRESULT configureEncoder(const WAVEFORMATEX *pWaveFormatEx) {
        if(lame_set_num_channels(gfp, pWaveFormatEx->nChannels)) {
            fwprintf(stderr, L"Could not set lame encoder number of channels to %d\n", pWaveFormatEx->nChannels);
            return E_INVALIDARG;
        }
        if(lame_set_in_samplerate(gfp, pWaveFormatEx->nSamplesPerSec)) {
            fwprintf(stderr, L"Could not set lame encoder sample rate to %d\n", pWaveFormatEx->nSamplesPerSec);
            return E_INVALIDARG;
        }
        if(lame_set_brate(gfp, 128)) {
            fwprintf(stderr, L"Could not set lame encoder bit rate to %d\n", 128);
            return E_INVALIDARG;
        }
        if(lame_set_mode(gfp, pWaveFormatEx->nChannels == 2 ? STEREO : MONO)) {
            fwprintf(stderr, L"Could not set lame encoder mode\n");
            return E_INVALIDARG;
        }
        /* 2=high 5 = medium 7=low */
        if(lame_set_quality(gfp, 5)) {
            fwprintf(stderr, L"Could not set lame encoder quality\n");
            return E_INVALIDARG;
        }
        if(lame_set_bWriteVbrTag(gfp, 0)) {
            fwprintf(stderr, L"Could not disable writing VBR tag\n");
            return E_INVALIDARG;
        }
        lame_mp3_tags_fid(gfp, 0);
        if(lame_init_params(gfp)) {
            fwprintf(stderr, L"Could not init lame params\n");
            return E_FAIL;
        }
        return S_OK;
    }

    virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
        if(!pWaveFormatEx) return E_INVALIDARG;
        if(pWaveFormatEx->wBitsPerSample != 16) {
            fwprintf(stderr, L"Only 16 bit samples are supported for mp3, got %d\n", pWaveFormatEx->wBitsPerSample);
            return E_INVALIDARG;
        }
        gfp = lame_init();
        if(!gfp) {
            fwprintf(stderr, L"Could not init lame encoder\n");
            return E_OUTOFMEMORY;
        }
        HRESULT hr = configureEncoder(pWaveFormatEx);
        if(SUCCEEDED(hr))
            hr = RawSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
        if(FAILED(hr)) {
            // Do not leak the encoder when the stream could not be set up.
            lame_close(gfp);
            gfp = 0;
        }
        return hr;
    }

    // Flushes the encoder's remaining output, frees the encoder and closes
    // the file. The encoder and file are released on every path; the first
    // failure is returned.
    STDMETHODIMP Close() {
        if(!gfp) return S_OK; // never bound, or already closed
        HRESULT hr = S_OK;
        int r = lame_encode_flush(gfp, encodebuf, sizeof(encodebuf));
        if(r < 0) {
            hr = E_FAIL;
        } else if(r > 0) {
            DWORD written = 0; // WriteFile needs this when no OVERLAPPED is passed
            if(!WriteFile(h, encodebuf, (DWORD)r, &written, 0)) {
                DWORD e = GetLastError();
                WCHAR errbuf[MAX_PATH];
                errbuf[0] = 0;
                FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, errbuf, sizeof(errbuf) / sizeof(errbuf[0]), 0);
                fwprintf(stderr, L"Could not write audio samples to %s: %d (%s)", isStdout ? L"stdout" : filename, e, errbuf);
                hr = HRESULT_FROM_WIN32(e);
            }
        }
        lame_close(gfp);
        gfp = 0;
        HRESULT closeHr = RawSpStream::Close();
        return FAILED(hr) ? hr : closeHr;
    }

    /**
     * Encodes `size` bytes of interleaved 16-bit PCM in chunks of 1152
     * samples and writes the MP3 data produced. `newPos`, when given,
     * receives the number of input bytes consumed (not the encoded size).
     */
    HRESULT STDMETHODCALLTYPE Write(const void *buf, ULONG size, ULONG *newPos) {
        if(size >= 16777216) {
            return E_INVALIDARG;
        }
        if(!gfp) return SPERR_UNINITIALIZED;
        ULONG nSamples = size * 8 / wfex.wBitsPerSample / wfex.nChannels;
        ULONG framesize = 1152;
        for(ULONG s = 0; s < nSamples; s += framesize) {
            ULONG samples = nSamples - s;
            if(samples > framesize) samples = framesize;
            int r;
            if(wfex.nChannels > 1)
                r = lame_encode_buffer_interleaved(gfp, ((short *)buf) + s * wfex.nChannels, samples, encodebuf, sizeof(encodebuf));
            else
                r = lame_encode_buffer(gfp, ((short *)buf) + s, 0, samples, encodebuf, sizeof(encodebuf));
            if(r < 0) return E_FAIL;
            if(r == 0) continue; // encoder is still buffering
            DWORD written = 0;
            if(WriteFile(h, encodebuf, (DWORD)r, &written, 0)) continue;
            DWORD e = GetLastError();
            WCHAR errbuf[MAX_PATH];
            errbuf[0] = 0;
            FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, 0, e, 0, errbuf, sizeof(errbuf) / sizeof(errbuf[0]), 0);
            fwprintf(stderr, L"Could not write audio samples to %s: %d (%s)", isStdout ? L"stdout" : filename, e, errbuf);
            return HRESULT_FROM_WIN32(e);
        }
        if(newPos) *newPos = size;
        return S_OK;
    }
};
/**
 * Speaks `text` with SAPI into `wavFilename` using the selected output type.
 *
 * @param text            text (or markup, depending on speakFlags) to speak
 * @param voiceId         token id below the SAPI voices registry key; NULL or
 *                        empty keeps the default voice
 * @param wavFilename     output path; "-" is passed through to the stream
 * @param outType         0 = pick by extension (.wav -> 2, .ogg -> 3,
 *                        .mp3 -> 5, anything else -> 1), 1 = raw PCM,
 *                        2 = SAPI's own stream, 3 = Ogg Vorbis, 4 = Ogg Opus,
 *                        5 = MP3
 * @param rate, volume    passed to ISpVoice::SetRate / SetVolume
 * @param speakFlags      passed to ISpVoice::Speak
 * @param samplesPerSec, bitsPerSample, nChannels  PCM format requested from SAPI
 * @param ullEventInterest event mask handed to the output stream
 * @return 0 on success, 1 after printing a message to stderr on any failure.
 */
int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, DWORD outType, int rate, int volume, DWORD speakFlags, DWORD samplesPerSec, WORD bitsPerSample, WORD nChannels, ULONGLONG ullEventInterest) {
    HRESULT hr;
    if(SP_IS_BAD_STRING_PTR(wavFilename)) {
        fwprintf(stderr, L"Invalid filename\n");
        return 1;
    }
    if(SP_IS_BAD_STRING_PTR(text)) {
        fwprintf(stderr, L"Invalid text\n");
        return 1;
    }
    // detect output type by file extension
    if(outType == 0) {
        outType = 1;
        if(wavFilename && wavFilename[0]) {
            size_t s = wcslen(wavFilename);
            if(s >= 4) {
                const WCHAR *ext = wavFilename + s - 4;
                if(!_wcsicmp(ext, L".wav"))
                    outType = 2;
                else if(!_wcsicmp(ext, L".ogg"))
                    outType = 3;
                else if(!_wcsicmp(ext, L".mp3"))
                    outType = 5;
            }
        }
    }
    if(addLexemes())
        return 1;
    CComPtr<ISpVoice> voice;
    hr = voice.CoCreateInstance(CLSID_SpVoice);
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not create voice instance: %d %s\n", hr, getErrorString(hr));
        return 1;
    }
    CComPtr<ISpObjectToken> voiceToken;
    if(voiceId && voiceId[0]) {
        WCHAR fullVoiceId[MAX_PATH];
        _snwprintf_s(fullVoiceId, MAX_PATH, _TRUNCATE, L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\%s", voiceId);
        hr = SpGetTokenFromId(fullVoiceId, &voiceToken);
        if(FAILED(hr)) {
            fwprintf(stderr, L"Could not get token for voice \"%s\": %d %s\n", voiceId, hr, getErrorString(hr));
            return 1;
        }
        hr = voice->SetVoice(voiceToken);
        if(FAILED(hr)) {
            fwprintf(stderr, L"Could not set voice: %d %s\n", hr, getErrorString(hr));
            return 1;
        }
    }
    hr = voice->SetRate(rate);
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not set rate to %d: %d %s\n", rate, hr, getErrorString(hr));
        return 1;
    }
    hr = voice->SetVolume(volume);
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not set volume to %d: %d %s\n", volume, hr, getErrorString(hr));
        return 1;
    }
    ISpStream *outputStream = 0;
    switch(outType) {
    case 1:
        outputStream = new RawSpStream();
        break;
    case 2:
        hr = ::CoCreateInstance(CLSID_SpStream, NULL, CLSCTX_ALL, __uuidof(ISpStream), (void **)&outputStream);
        if(FAILED(hr)) {
            fwprintf(stderr, L"Could not instantiate SpStream: %d %s\n", hr, getErrorString(hr));
            return 1;
        }
        break;
    case 3:
        outputStream = new OggVorbisSpStream();
        break;
    case 4:
        outputStream = new OggOpusSpStream();
        break;
    case 5:
        outputStream = new Mp3SpStream();
        break;
    default:
        fwprintf(stderr, L"Invalid output type %d\n", outType);
        return 1; // same failure code as every other error path
    }
    if(!outputStream) {
        fwprintf(stderr, L"Could not initialize output stream\n");
        return 1;
    }
    WAVEFORMATEX wfex;
    wfex.wFormatTag = WAVE_FORMAT_PCM;
    wfex.nChannels = nChannels;
    wfex.nSamplesPerSec = samplesPerSec;
    wfex.wBitsPerSample = bitsPerSample;
    wfex.nBlockAlign = wfex.nChannels * wfex.wBitsPerSample / 8;
    wfex.nAvgBytesPerSec = wfex.nSamplesPerSec * wfex.nBlockAlign;
    wfex.cbSize = 0;
    hr = outputStream->BindToFile(wavFilename, SPFM_CREATE_ALWAYS, &SPDFID_WaveFormatEx, &wfex, ullEventInterest);
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not bind to file %s: %d %s\n", wavFilename, hr, getErrorString(hr));
        outputStream->Release();
        return 1;
    }
    int result = 0;
    hr = voice->SetOutput(outputStream, FALSE);
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not set output: %d %s\n", hr, getErrorString(hr));
        result = 1;
    } else {
        hr = voice->Speak(text, speakFlags, 0);
        if(FAILED(hr)) {
            fwprintf(stderr, L"Could not speak: %d %s\n", hr, getErrorString(hr));
            result = 1;
        }
    }
    // Release here so the destructor doesn't do it after we've closed the output file
    voice.Release();
    // The stream is closed on the failure paths too, so the file handle and
    // any encoder state are not left open.
    hr = outputStream->Close();
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not close %s: %d %s\n", wavFilename, hr, getErrorString(hr));
        result = 1;
    }
    // Drops our reference; required for the COM-created stream (type 2).
    outputStream->Release();
    if(voiceId && voiceId[0])
        voiceToken.Release();
    return result;
}
// Program entry point: parses the command line, then either lists the
// installed voices (--list) or synthesizes the given text to an output file.
//
// Parameters:
//   argc, argv - wide-character command line as delivered by the CRT.
//
// Returns:
//   1 when an unknown option is seen, when the usage text is printed
//   (including an explicit --help), or when COM cannot be initialized;
//   otherwise the value returned by listVoices() / speakToWav().
int wmain(int argc, WCHAR *argv[]) {
    // https://stackoverflow.com/questions/2492077/output-unicode-strings-in-windows-console-app
    (void)_setmode(_fileno(stdout), _O_U8TEXT);

    const struct option long_options[] = {
        { L"help", no_argument, 0, L'h' },
        { L"list", no_argument, 0, L'l' },
        { L"output", required_argument, 0, L'o' },
        { L"out-type", required_argument, 0, L'T' },
        { L"voice", required_argument, 0, L'v' },
        { L"type", required_argument, 0, L't' },
        { L"rate", required_argument, 0, L'r' },
        { L"volume", required_argument, 0, L'V' },
        { L"sample-rate", required_argument, 0, L's' },
        { L"bits", required_argument, 0, L'b' },
        { L"channels", required_argument, 0, L'c' },
        { L"events", required_argument, 0, L'e' },
        { 0, 0, 0, 0 },
    };

    // Option defaults; these match the defaults quoted in the usage text
    // for sample rate, bit depth and channel count.
    int help = 0;
    int list = 0;
    WCHAR *voice = 0;
    WCHAR *wavFilename = 0;
    DWORD speakFlags = 0;
    int rate = 0;
    int volume = 100;
    DWORD samplesPerSec = 22050;
    WORD bitsPerSample = 16, nChannels = 1;
    ULONGLONG ullEventInterest = 0;
    DWORD outType = 0;

    int option;
    int option_index = 0;
    for(;;) {
        // Every option that reads optarg below must carry a trailing ':'
        // here. `V` used to lack it, so `-V 50` left optarg unset for the
        // wcstol() call and treated `50` as the text to speak.
        option = getoptW_long(argc, argv, L"hlo:T:v:t:r:V:s:b:c:e:", long_options, &option_index);
        if(option == L'?') {
            return 1;
        }
        if(option == -1) break;

        switch(option) {
            case L'h':
                help = 1;
                break;
            case L'l':
                list = 1;
                break;
            case L'o':
                wavFilename = optarg;
                break;
            case L'T':
                // Numeric codes are the ones speakToWav() switches on.
                if(!_wcsicmp(optarg, L"auto"))
                    outType = 0;
                else if(!_wcsicmp(optarg, L"raw"))
                    outType = 1;
                else if(!_wcsicmp(optarg, L"wav"))
                    outType = 2;
                else if(!_wcsicmp(optarg, L"ogg") || !_wcsicmp(optarg, L"ogg+vorbis"))
                    outType = 3;
                else if(!_wcsicmp(optarg, L"ogg+opus"))
                    outType = 4;
                else if(!_wcsicmp(optarg, L"mp3"))
                    outType = 5;
                else
                    help = 1; // unknown type: fall back to the usage text
                break;
            case L'v':
                voice = optarg;
                break;
            case L't':
                if(!_wcsicmp(optarg, L"ssml"))
                    speakFlags = SPF_IS_XML | SPF_PARSE_SSML;
                else if(!_wcsicmp(optarg, L"sapi"))
                    speakFlags = SPF_IS_XML | SPF_PARSE_SAPI;
                else if(!_wcsicmp(optarg, L"auto"))
                    speakFlags = SPF_IS_XML | SPF_PARSE_AUTODETECT;
                // The usage text advertises PLAIN; `text` is kept as the
                // historically accepted spelling.
                else if(!_wcsicmp(optarg, L"text") || !_wcsicmp(optarg, L"plain"))
                    speakFlags = SPF_IS_NOT_XML;
                else
                    help = 1; // unknown type: fall back to the usage text
                break;
            case L'r':
                rate = wcstol(optarg, 0, 10);
                break;
            case L'V':
                volume = wcstol(optarg, 0, 10);
                break;
            case L's':
                samplesPerSec = wcstol(optarg, 0, 10);
                break;
            case L'b':
                bitsPerSample = (WORD)wcstol(optarg, 0, 10);
                break;
            case L'c':
                nChannels = (WORD)wcstol(optarg, 0, 10);
                break;
            case L'e':
                if(!_wcsicmp(optarg, L"all"))
                    ullEventInterest = 0xfffe; // 65534, "All TTS events" in the usage text
                else
                    // Base 0 lets the mask be given in decimal, hex (0x..) or octal.
                    ullEventInterest = wcstol(optarg, 0, 0);
                break;
        }
    }

    // Speaking needs a positional <text> argument; listing does not.
    if(!list && optind >= argc)
        help = 1;

    if(help) {
        fwprintf(
            stderr,
            L"Usage: %s --list | [options] <text>\n"
            L" -h, --help Print this help.\n"
            L" -l, --list List all voices.\n"
            L" -o, --output=FILE Output file. Default is `output.wav`\n"
            L" Use `-' for stdout.\n"
            L" -T, --out-type=TYPE Output file type. Default is `auto'\n"
            L" `wav' for RIFF .wav\n"
            L" `ogg' or `ogg+vorbis' for Ogg Vorbis\n"
            L" `ogg+opus' for Ogg Opus\n"
            L" `mp3' for MP3\n"
            L" `raw' for raw PCM samples\n"
            L" `auto' to autodetect from file extension\n"
            L" -v, --voice=VOICE Select voice.\n"
            L" -r, --rate=RATE Rate (speed) of speech, from -10 to 10.\n"
            L" -t, --type=TYPE Input text type (PLAIN,SSML,SAPI,AUTO).\n"
            L" -V, --volume=VOL Volume of speech, from 0 to 100.\n"
            L" -s, --sample-rate=HZ Sample rate of output. Default 22050.\n"
            L" -b, --bits=BITS Bit depth of output. Default 16.\n"
            L" -c, --channels=CHANNELS Number of audio channels of output. Default 1.\n"
            L" -e, --events=MASK Select events that are output.\n"
            L" Possible values, bitwise ORed:\n"
            L" Stream start 2\n"
            L" Stream end 4\n"
            L" Voice change 8\n"
            L" Bookmark 16\n"
            L" Word boundary 32\n"
            L" Phoneme 64\n"
            L" Sentence boundary 128\n"
            L" Viseme 256\n"
            L" Audio level 512\n"
            L" All TTS events 65534 or `all'\n"
            L" By default, events are logged into the\n"
            L" output stream if it is a .wav or an .ogg.\n"
            L" If output is stdout (`-'), and the event mask\n"
            L" is non zero, events are output on\n"
            L" file descriptor 3.\n",
            argv[0]
        );
        return 1;
    }

    // COM must be initialized before listVoices()/speakToWav() are called,
    // and is torn down again once they return.
    HRESULT hr = ::CoInitialize(NULL);
    if(FAILED(hr)) {
        fwprintf(stderr, L"Could not initialize COM: %x %s\n", hr, getErrorString(hr));
        return 1;
    }

    int ret = 0;
    if(list) {
        ret = listVoices();
    } else {
        ret = speakToWav(argv[optind], voice, wavFilename, outType, rate, volume, speakFlags, samplesPerSec, bitsPerSample, nChannels, ullEventInterest);
    }

    ::CoUninitialize();
    return ret;
}