@ -14,6 +14,9 @@
# include <io.h>
# include <fcntl.h>
# include <vorbis/vorbisenc.h>
# include <opus.h>
# include "getoptw.h"
const WCHAR * getErrorString ( HRESULT r ) {
@ -23,8 +26,8 @@ const WCHAR *getErrorString(HRESULT r) {
return L " Unknown " ;
}
void printJsonString ( const WCHAR * in ) {
if ( ! in ) {
void printJsonString ( const WCHAR * in ) {
if ( ! in ) {
fwprintf ( stdout , L " null " ) ;
return ;
}
@ -39,9 +42,9 @@ void printJsonString(const WCHAR* in) {
fputwc ( in [ i ] , stdout ) ;
}
fputwc ( L ' " ' , stdout ) ;
}
}
void printJsonKeyPair ( const WCHAR * key , const WCHAR * value , int skipComma = 0 ) {
void printJsonKeyPair ( const WCHAR * key , const WCHAR * value , int skipComma = 0 ) {
printJsonString ( key ) ;
wprintf ( L " : " ) ;
printJsonString ( value ) ;
@ -78,18 +81,18 @@ int listVoices() {
return 1 ;
}
WCHAR * idString = 0L ;
WCHAR * idString = 0L ;
hr = cpVoiceToken - > GetId ( & idString ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not get token ID: %d %s \n " , hr , getErrorString ( hr ) ) ;
return 1 ;
}
wprintf ( L " { \n " ) ;
WCHAR * idBasename = 0L ;
WCHAR * idBasename = 0L ;
idBasename = wcsrchr ( idString , ' \\ ' ) ;
printJsonKeyPair ( L " id " , idBasename & & idBasename [ 0 ] ? idBasename + 1 : idString ) ;
WCHAR * descriptionString = 0L ;
WCHAR * descriptionString = 0L ;
hr = SpGetDescription ( cpVoiceToken , & descriptionString ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not get token description: %d %s \n " , hr , getErrorString ( hr ) ) ;
@ -104,26 +107,26 @@ int listVoices() {
return 1 ;
}
WCHAR * age ;
WCHAR * age ;
cpSpAttributesKey - > GetStringValue ( L " Age " , & age ) ;
printJsonKeyPair ( L " age " , age ) ;
WCHAR * gender ;
WCHAR * gender ;
cpSpAttributesKey - > GetStringValue ( L " Gender " , & gender ) ;
printJsonKeyPair ( L " gender " , gender ) ;
WCHAR * language ;
WCHAR * language ;
cpSpAttributesKey - > GetStringValue ( L " Language " , & language ) ;
WCHAR strNameBuffer [ LOCALE_NAME_MAX_LENGTH ] = { 0 } ;
WCHAR strNameBuffer [ LOCALE_NAME_MAX_LENGTH ] = { 0 } ;
int langId = wcstol ( language , NULL , 16 ) ;
LCIDToLocaleName ( langId , strNameBuffer , LOCALE_NAME_MAX_LENGTH , 0 ) ;
printJsonKeyPair ( L " language " , strNameBuffer ) ;
WCHAR * name ;
WCHAR * name ;
cpSpAttributesKey - > GetStringValue ( L " Name " , & name ) ;
printJsonKeyPair ( L " name " , name ) ;
WCHAR * vendor ;
WCHAR * vendor ;
cpSpAttributesKey - > GetStringValue ( L " Vendor " , & vendor ) ;
printJsonKeyPair ( L " vendor " , vendor , 1 ) ;
@ -158,8 +161,8 @@ int addLexemes() {
const struct {
LANGID langId ;
const WCHAR * word ;
const WCHAR * phone ;
const WCHAR * word ;
const WCHAR * phone ;
} lexemes [ ] = {
{ langidUS , L " cum " , L " k uw m " } ,
{ langidUS , L " poo " , L " p uw " } ,
@ -196,59 +199,343 @@ int addLexemes() {
return 0 ;
}
class Poo SpStream: public ISpStream Format , public ISpEventSink {
class Base SpStream: public ISpStream , public ISpEventSink {
public :
CComPtr < ISpEventSink > sink ;
LPCWSTR filename ;
WAVEFORMATEX wfex ;
const GUID * formatId ;
ULONGLONG ullEventInterest ;
HANDLE h ;
PooSpStream ( ) {
h = GetStdHandle ( STD_OUTPUT_HANDLE ) ;
}
BOOL isStdout ;
BaseSpStream ( ) { }
STDMETHODIMP QueryInterface ( REFIID riid , void * * ppv ) {
if ( ppv = = NULL ) return E_INVALIDARG ;
STDMETHODIMP QueryInterface ( REFIID riid , void * * ppv ) {
if ( ppv = = NULL ) return E_INVALIDARG ;
* ppv = NULL ;
if ( riid = = IID_IUnknown | | riid = = IID_ISequentialStream | | riid = = IID_IStream | | riid = = IID_ISpStreamFormat )
* ppv = static_cast < ISpStreamFormat * > ( this ) ;
else if ( riid = = IID_ISpEventSink )
* ppv = static_cast < ISpEventSink * > ( this ) ;
if ( riid = = IID_IUnknown | | riid = = IID_ISequentialStream | | riid = = IID_IStream | | riid = = IID_ISpStreamFormat | | riid = = IID_ISpStream )
* ppv = static_cast < ISpStreamFormat * > ( this ) ;
else if ( riid = = IID_ISpEventSink )
* ppv = static_cast < ISpEventSink * > ( this ) ;
else return E_NOINTERFACE ;
return S_OK ;
}
STDMETHODIMP_ ( ULONG ) AddRef ( void ) { return 1 ; }
STDMETHODIMP_ ( ULONG ) Release ( void ) { return 1 ; }
HRESULT STDMETHODCALLTYPE Read ( void * , ULONG , ULONG * ) { wprintf ( L " Read \n " ) ; return 0 ; }
HRESULT STDMETHODCALLTYPE Write ( const void * buf , ULONG size , ULONG * newPos ) {
WriteFile ( h , buf , size , newPos , NULL ) ;
return S_OK ;
}
HRESULT STDMETHODCALLTYPE Seek ( LARGE_INTEGER dlibMove , DWORD dwOrigin , ULARGE_INTEGER * plibNewPosition ) {
STDMETHODIMP Read ( void * , ULONG , ULONG * ) { return 0 ; }
virtual STDMETHODIMP Write ( const void * buf , ULONG size , ULONG * newPos ) = 0 ;
STDMETHODIMP Seek ( LARGE_INTEGER dlibMove , DWORD dwOrigin , ULARGE_INTEGER * plibNewPosition ) {
if ( plibNewPosition )
plibNewPosition - > QuadPart = dlibMove . QuadPart ;
return S_OK ;
}
HRESULT STDMETHODCALLTYPE SetSize ( ULARGE_INTEGER ) { return 0 ; }
HRESULT STDMETHODCALLTYPE CopyTo ( IStream * , ULARGE_INTEGER , ULARGE_INTEGER * , ULARGE_INTEGER * ) { return 0 ; }
HRESULT STDMETHODCALLTYPE Commit ( DWORD ) { return 0 ; }
HRESULT STDMETHODCALLTYPE Revert ( void ) { return 0 ; }
HRESULT STDMETHODCALLTYPE LockRegion ( ULARGE_INTEGER , ULARGE_INTEGER , DWORD ) { return 0 ; }
HRESULT STDMETHODCALLTYPE UnlockRegion ( ULARGE_INTEGER , ULARGE_INTEGER , DWORD ) { return 0 ; }
HRESULT STDMETHODCALLTYPE Stat ( STATSTG * , DWORD ) { return 0 ; }
HRESULT STDMETHODCALLTYPE Clone ( IStream * * ) { return 0 ; }
HRESULT STDMETHODCALLTYPE GetFormat ( GUID * pguidFormatId , WAVEFORMATEX * * format ) {
return SpConvertStreamFormatEnum ( SPSF_16kHz16BitMono , pguidFormatId , format ) ;
}
HRESULT STDMETHODCALLTYPE AddEvents ( const SPEVENT * pEventArray , ULONG ulCount ) {
STDMETHODIMP SetSize ( ULARGE_INTEGER ) { return 0 ; }
STDMETHODIMP CopyTo ( IStream * , ULARGE_INTEGER , ULARGE_INTEGER * , ULARGE_INTEGER * ) { return 0 ; }
STDMETHODIMP Commit ( DWORD ) { return 0 ; }
STDMETHODIMP Revert ( void ) { return 0 ; }
STDMETHODIMP LockRegion ( ULARGE_INTEGER , ULARGE_INTEGER , DWORD ) { return 0 ; }
STDMETHODIMP UnlockRegion ( ULARGE_INTEGER , ULARGE_INTEGER , DWORD ) { return 0 ; }
STDMETHODIMP Stat ( STATSTG * , DWORD ) { return 0 ; }
STDMETHODIMP Clone ( IStream * * ) { return 0 ; }
STDMETHODIMP GetFormat ( GUID * pguidFormatId , WAVEFORMATEX * * format ) {
* pguidFormatId = * formatId ;
WAVEFORMATEX * pwfex = ( WAVEFORMATEX * ) : : CoTaskMemAlloc ( sizeof ( WAVEFORMATEX ) ) ;
CopyMemory ( pwfex , & wfex , sizeof ( WAVEFORMATEX ) ) ;
* format = pwfex ;
return S_OK ;
}
virtual STDMETHODIMP writeEventData ( void * buf , ULONG sz ) = 0 ;
const WCHAR * event_names [ 15 ] = {
L " undefined " ,
L " startInputStream " ,
L " endInputStream " ,
L " voiceChange " ,
L " ttsBookmark " ,
L " wordBoundary " ,
L " phoneme " ,
L " sentenceBoundary " ,
L " viseme " ,
L " ttsAudioLevel " ,
} ;
STDMETHODIMP writeJsonEvent ( const SPEVENT * ev ) {
WCHAR buf [ MAX_PATH ] ;
ULONGLONG timeMs = ev - > ullAudioStreamOffset * 1000 / 16000 ;
_snwprintf_s ( buf , sizeof ( buf ) / sizeof ( buf [ 0 ] ) , L " { \" type \" : \" %s \" , \" timeMs \" :%llu} \n " , event_names [ ev - > eEventId < 15 ? ev - > eEventId : 0 ] , timeMs ) ;
writeEventData ( buf , wcslen ( buf ) * sizeof ( buf [ 0 ] ) ) ;
return S_OK ;
}
// FIXME: optimize by not allocating every time
STDMETHODIMP writeSpEvent ( const SPEVENT * ev ) {
CSpEvent cspev ;
cspev . CopyFrom ( ev ) ;
ULONG sz = cspev . SerializeSize < SPSERIALIZEDEVENT > ( ) ;
BYTE * buf = new BYTE [ sz ] ;
cspev . Serialize < SPSERIALIZEDEVENT > ( ( SPSERIALIZEDEVENT * ) buf ) ;
writeEventData ( buf , sz ) ;
delete [ ] buf ;
return S_OK ;
}
STDMETHODIMP AddEvents ( const SPEVENT * pEventArray , ULONG ulCount ) {
for ( ULONG i = 0 ; i < ulCount ; i + + ) {
const SPEVENT * ev = & pEventArray [ i ] ;
writeSpEvent ( ev ) ;
}
return S_OK ;
}
STDMETHODIMP GetEventInterest ( ULONGLONG * pullEventInterest ) {
* pullEventInterest = ullEventInterest ;
return S_OK ;
}
STDMETHODIMP SetBaseStream ( IStream * pStream , REFGUID rguidFormat , const WAVEFORMATEX * pWaveFormatEx ) {
wprintf ( L " SetBaseStream \n " ) ;
return S_OK ;
}
STDMETHODIMP GetBaseStream ( IStream * * ppStream ) {
wprintf ( L " GetBaseStream \n " ) ;
return S_OK ;
}
virtual STDMETHODIMP BindToFile ( LPCWSTR filename_ , SPFILEMODE eMode , const GUID * pFormatId , const WAVEFORMATEX * pWaveFormatEx , ULONGLONG ullEventInterest_ ) {
wprintf ( L " BindToFile filename_= \" %s \" ullEventInterest=0x%04llx \n " , filename_ , ullEventInterest_ ) ;
filename = filename_ ;
ullEventInterest = ullEventInterest_ ;
formatId = pFormatId ;
CopyMemory ( & wfex , pWaveFormatEx , sizeof ( WAVEFORMATEX ) ) ;
isStdout = filename_ & & filename [ 0 ] = = ' - ' & & filename [ 1 ] = = 0 ;
if ( isStdout ) {
h = GetStdHandle ( STD_OUTPUT_HANDLE ) ;
} else {
h = CreateFileW ( filename , GENERIC_WRITE , FILE_SHARE_READ , 0 , CREATE_ALWAYS , FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN , 0 ) ;
if ( h = = INVALID_HANDLE_VALUE ) {
DWORD e = GetLastError ( ) ;
WCHAR buf [ MAX_PATH ] ;
FormatMessage ( FORMAT_MESSAGE_FROM_SYSTEM , 0 , e , 0 , buf , sizeof ( buf ) / sizeof ( buf [ 0 ] ) , 0 ) ;
fwprintf ( stderr , L " Could not open \" %s \" for writing: %d %s \n " , filename_ , e , buf ) ;
return HRESULT_FROM_WIN32 ( e ) ;
}
}
return S_OK ;
}
virtual STDMETHODIMP Close ( void ) {
wprintf ( L " Close \n " ) ;
if ( ! isStdout & & h ) {
BOOL b = CloseHandle ( h ) ;
if ( ! b ) {
DWORD e = GetLastError ( ) ;
WCHAR buf [ MAX_PATH ] ;
FormatMessage ( FORMAT_MESSAGE_FROM_SYSTEM , 0 , e , 0 , buf , sizeof ( buf ) / sizeof ( buf [ 0 ] ) , 0 ) ;
fwprintf ( stderr , L " Could not close \" %s \" : %d (%s) " , filename , e , buf ) ;
return HRESULT_FROM_WIN32 ( e ) ;
}
}
return S_OK ;
}
} ;
/**
 * Output stream that writes the synthesized samples unmodified.
 *
 * Audio goes to the handle opened by BaseSpStream::BindToFile. Serialized
 * events go to the OS handle behind file descriptor 3, and only when the
 * audio itself is being written to standard output.
 */
class RawSpStream : public BaseSpStream {
public:
	HANDLE eh = NULL; // events file handle; NULL when there is no usable event output

	RawSpStream() {}

	/**
	 * Opens the audio destination and, for stdout output, looks up the
	 * events handle.
	 *
	 * @return the base class failure if the destination cannot be opened;
	 *         E_INVALIDARG when events are requested but the output is not
	 *         stdout; S_OK otherwise.
	 */
	virtual STDMETHODIMP BindToFile(LPCWSTR filename_, SPFILEMODE eMode, const GUID *pFormatId, const WAVEFORMATEX *pWaveFormatEx, ULONGLONG ullEventInterest_) {
		HRESULT hr = BaseSpStream::BindToFile(filename_, eMode, pFormatId, pWaveFormatEx, ullEventInterest_);
		if (FAILED(hr)) return hr;

		eh = NULL;
		if (isStdout) {
			// _get_osfhandle reports failure as -1 (and -2 for a descriptor
			// with no associated stream), never as 0.
			const intptr_t osHandle = _get_osfhandle(3);
			if (osHandle != -1 && osHandle != -2)
				eh = reinterpret_cast<HANDLE>(osHandle);
		} else if (ullEventInterest) {
			fwprintf(stderr, L"Cannot select events when output is not stdout\n");
			// Release the file that the base class just opened.
			BaseSpStream::Close();
			return E_INVALIDARG;
		}
		return S_OK;
	}

	/**
	 * Writes audio bytes to the output handle.
	 *
	 * @param newPos optional; receives the number of bytes actually written.
	 * @return S_OK, or the HRESULT form of the WriteFile error.
	 */
	HRESULT STDMETHODCALLTYPE Write(const void *buf, ULONG size, ULONG *newPos) {
		// WriteFile requires a byte-count pointer for synchronous writes, and
		// the caller is allowed to pass NULL for newPos.
		DWORD written = 0;
		const BOOL ok = WriteFile(h, buf, size, &written, 0);
		if (newPos) *newPos = written;
		return ok ? S_OK : HRESULT_FROM_WIN32(GetLastError());
	}

	/**
	 * Writes serialized event bytes to the events handle.
	 *
	 * @return E_FAIL when no events handle is available; the HRESULT form of
	 *         the WriteFile error on failure; S_OK otherwise.
	 */
	STDMETHODIMP writeEventData(void *buf, ULONG sz) {
		if (!eh) return E_FAIL;
		DWORD written = 0;
		if (!WriteFile(eh, buf, sz, &written, 0))
			return HRESULT_FROM_WIN32(GetLastError());
		return S_OK;
	}
};
class OggSpStream : public BaseSpStream {
public :
ogg_stream_state ogg_voice_st ;
ogg_stream_state ogg_events_st ;
ogg_page og ;
ogg_packet op ;
ULONG granulepos ;
ULONG packetno ;
vorbis_info vi ;
vorbis_comment vc ;
vorbis_dsp_state vd ;
vorbis_block vb ;
OggSpStream ( ) { }
// FIXME: error checking
virtual STDMETHODIMP BindToFile ( LPCWSTR filename_ , SPFILEMODE eMode , const GUID * pFormatId , const WAVEFORMATEX * pWaveFormatEx , ULONGLONG ullEventInterest_ ) {
BaseSpStream : : BindToFile ( filename_ , eMode , pFormatId , pWaveFormatEx , ullEventInterest_ ) ;
vorbis_info_init ( & vi ) ;
vorbis_encode_init_vbr ( & vi , wfex . nChannels , wfex . nSamplesPerSec , 0.1f ) ;
vorbis_comment_init ( & vc ) ;
vorbis_comment_add_tag ( & vc , " ENCODER " , " sapicli " ) ;
vorbis_analysis_init ( & vd , & vi ) ;
vorbis_block_init ( & vd , & vb ) ;
ogg_stream_init ( & ogg_voice_st , 1 ) ;
ogg_stream_init ( & ogg_events_st , 2 ) ;
granulepos = 0 ;
packetno = 0 ;
{
ogg_packet header ;
ogg_packet header_comm ;
ogg_packet header_code ;
vorbis_analysis_headerout ( & vd , & vc , & header , & header_comm , & header_code ) ;
ogg_stream_packetin ( & ogg_voice_st , & header ) ; /* automatically placed in its own page */
while ( ogg_stream_flush ( & ogg_voice_st , & og ) ) {
WriteFile ( h , og . header , og . header_len , 0 , 0 ) ;
WriteFile ( h , og . body , og . body_len , 0 , 0 ) ;
}
if ( ullEventInterest ! = 0 ) {
unsigned char evntHead [ 8 ] = { ' S ' , ' A ' , ' P ' , ' I ' , ' E ' , ' V ' , ' N ' , ' T ' } ;
ogg_packet p ;
memset ( & p , 0 , sizeof ( p ) ) ;
p . packet = evntHead ;
p . bytes = 8 ;
p . b_o_s = 1 ;
ogg_stream_packetin ( & ogg_events_st , & p ) ;
while ( ogg_stream_flush ( & ogg_events_st , & og ) ) {
WriteFile ( h , og . header , og . header_len , 0 , 0 ) ;
WriteFile ( h , og . body , og . body_len , 0 , 0 ) ;
}
}
ogg_stream_packetin ( & ogg_voice_st , & header_comm ) ;
ogg_stream_packetin ( & ogg_voice_st , & header_code ) ;
while ( ogg_stream_flush ( & ogg_voice_st , & og ) ) {
WriteFile ( h , og . header , og . header_len , 0 , 0 ) ;
WriteFile ( h , og . body , og . body_len , 0 , 0 ) ;
}
}
return S_OK ;
}
HRESULT STDMETHODCALLTYPE Write ( const void * buf , ULONG size , ULONG * newPos ) {
int eos = 0 ;
if ( size = = 0 ) {
vorbis_analysis_wrote ( & vd , 0 ) ;
} else {
int nSamples = size * 8 / wfex . wBitsPerSample / wfex . nChannels ;
float * * buffer = vorbis_analysis_buffer ( & vd , nSamples ) ;
for ( int j = 0 ; j < wfex . nChannels ; j + + ) {
float * sample = buffer [ j ] ;
for ( int i = 0 ; i < nSamples ; i + + ) {
float sm = wfex . wBitsPerSample = = 16 ? ( ( short * ) buf ) [ i ] / 32768.f : ( ( char * ) buf ) [ i ] / 256.f ;
* ( sample + + ) = sm ;
}
}
vorbis_analysis_wrote ( & vd , nSamples ) ;
}
while ( vorbis_analysis_blockout ( & vd , & vb ) = = 1 ) {
vorbis_analysis ( & vb , NULL ) ;
vorbis_bitrate_addblock ( & vb ) ;
while ( vorbis_bitrate_flushpacket ( & vd , & op ) ) {
ogg_stream_packetin ( & ogg_voice_st , & op ) ;
while ( ! eos ) {
int result = ogg_stream_pageout ( & ogg_voice_st , & og ) ;
if ( result = = 0 ) break ;
WriteFile ( h , og . header , og . header_len , 0 , 0 ) ;
WriteFile ( h , og . body , og . body_len , 0 , 0 ) ;
if ( ogg_page_eos ( & og ) ) eos = 1 ;
}
}
}
if ( newPos ) * newPos + = size ;
return S_OK ;
}
HRESULT STDMETHODCALLTYPE GetEventInterest ( ULONGLONG * pullEventInterest ) {
* pullEventInterest = 0xFFFFFFFF ;
HRESULT STDMETHODCALLTYPE Close ( ) {
Write ( 0 , 0 , 0 ) ;
if ( ullEventInterest ! = 0 ) {
ogg_packet p ;
memset ( & p , 0 , sizeof ( p ) ) ;
p . packet = 0 ;
p . bytes = 0 ;
p . e_o_s = 1 ;
ogg_stream_packetin ( & ogg_events_st , & p ) ;
while ( ogg_stream_pageout ( & ogg_events_st , & og ) ) {
WriteFile ( h , og . header , og . header_len , 0 , 0 ) ;
WriteFile ( h , og . body , og . body_len , 0 , 0 ) ;
}
}
ogg_stream_clear ( & ogg_voice_st ) ;
ogg_stream_clear ( & ogg_events_st ) ;
vorbis_block_clear ( & vb ) ;
vorbis_dsp_clear ( & vd ) ;
vorbis_comment_clear ( & vc ) ;
vorbis_info_clear ( & vi ) ;
return BaseSpStream : : Close ( ) ;
}
// FIXME: error checking
STDMETHODIMP writeEventData ( void * buf , ULONG sz ) {
return S_OK ;
ogg_packet p ;
p . packet = ( unsigned char * ) buf ;
p . bytes = sz ;
ogg_stream_packetin ( & ogg_voice_st , & p ) ;
while ( 1 ) {
int result = ogg_stream_pageout ( & ogg_events_st , & og ) ;
if ( result = = 0 ) break ;
WriteFile ( h , og . header , og . header_len , 0 , 0 ) ;
WriteFile ( h , og . body , og . body_len , 0 , 0 ) ;
}
return S_OK ;
}
} ;
int speakToWav ( WCHAR * text , WCHAR * voiceId , WCHAR * wavFilename , int rate , int volume , DWORD speakFlags , ULONGLONG ullEventInterest ) {
int speakToWav ( WCHAR * text , WCHAR * voiceId , WCHAR * wavFilename , DWORD outType , int rate , int volume , DWORD speakFlag s, DWORD samplesPerSec , WORD bitsPerSample , WORD nChannel s, ULONGLONG ullEventInterest ) {
HRESULT hr ;
// detect output type by file extension
if ( outType = = 0 ) {
outType = 1 ;
if ( wavFilename & & wavFilename [ 0 ] ) {
size_t s = wcslen ( wavFilename ) ;
if ( s > = 4 ) {
if ( ! _wcsicmp ( wavFilename + s - 4 , L " .ogg " ) )
outType = 3 ;
else if ( ! _wcsicmp ( wavFilename + s - 4 , L " .wav " ) )
outType = 2 ;
}
}
}
if ( addLexemes ( ) )
return 1 ;
@ -259,27 +546,22 @@ int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, int rate, int vo
return 1 ;
}
WCHAR fullVoiceId [ MAX_PATH ] ;
_snwprintf_s ( fullVoiceId , MAX_PATH , _TRUNCATE , L " HKEY_LOCAL_MACHINE \\ SOFTWARE \\ Microsoft \\ Speech \\ Voices \\ Tokens \\ %s " , voiceId ) ;
CComPtr < ISpObjectToken > voiceToken ;
hr = SpGetTokenFromId ( fullVoiceId , & voiceToken ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not get token for voice \" %s \" : %d %s \n " , voiceId , hr , getErrorString ( hr ) ) ;
return 1 ;
}
if ( voiceId & & voiceId [ 0 ] ) {
WCHAR fullVoiceId [ MAX_PATH ] ;
_snwprintf_s ( fullVoiceId , MAX_PATH , _TRUNCATE , L " HKEY_LOCAL_MACHINE \\ SOFTWARE \\ Microsoft \\ Speech \\ Voices \\ Tokens \\ %s " , voiceId ) ;
hr = voice - > SetVoice ( voiceToken ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not set voice: %d %s\n " , hr , getErrorString ( hr ) ) ;
return 1 ;
}
hr = SpGetTokenFromId ( fullVoiceId , & voiceToken ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not get token for voice \" %s \" : %d %s \n " , voiceId , hr , getErrorString ( hr ) ) ;
return 1 ;
}
PooSpStream poo ;
hr = voice - > SetOutput ( static_cast < IUnknown * > ( static_cast < ISpStreamFormat * > ( & poo ) ) , FALSE ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not set output: %d %s \n " , hr , getErrorString ( hr ) ) ;
return 1 ;
hr = voice - > SetVoice ( voiceToken ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not set voice: %d %s \n " , hr , getErrorString ( hr ) ) ;
return 1 ;
}
}
hr = voice - > SetRate ( rate ) ;
@ -294,40 +576,79 @@ int speakToWav(WCHAR *text, WCHAR *voiceId, WCHAR *wavFilename, int rate, int vo
return 1 ;
}
ISpStream * outputStream = 0 ;
if ( outType = = 2 ) {
HRESULT hr = : : CoCreateInstance ( CLSID_SpStream , NULL , CLSCTX_ALL , __uuidof ( outputStream ) , ( void * * ) & outputStream ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not instantiate SpStream: %d %s \n " , hr , getErrorString ( hr ) ) ;
}
} else if ( outType = = 3 ) {
outputStream = new OggSpStream ( ) ;
} else if ( outType = = 1 ) {
outputStream = new RawSpStream ( ) ;
} else {
fwprintf ( stderr , L " Invalid output type %d \n " , outType ) ;
return E_INVALIDARG ;
}
if ( ! outputStream ) {
fwprintf ( stderr , L " Could not initialize output stream \n " ) ;
return E_FAIL ;
}
WAVEFORMATEX wfex ;
wfex . wFormatTag = WAVE_FORMAT_PCM ;
wfex . nChannels = nChannels ;
wfex . nSamplesPerSec = samplesPerSec ;
wfex . wBitsPerSample = bitsPerSample ;
wfex . nBlockAlign = wfex . nChannels * wfex . wBitsPerSample / 8 ;
wfex . nAvgBytesPerSec = wfex . nSamplesPerSec * wfex . nBlockAlign ;
wfex . cbSize = 0 ;
hr = outputStream - > BindToFile ( wavFilename , SPFM_CREATE_ALWAYS , & SPDFID_WaveFormatEx , & wfex , ullEventInterest ) ;
if ( FAILED ( hr ) ) {
outputStream - > Release ( ) ;
return 1 ;
}
hr = voice - > SetOutput ( outputStream , FALSE ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not set output: %d %s \n " , hr , getErrorString ( hr ) ) ;
return 1 ;
}
hr = voice - > Speak ( text , speakFlags , 0 ) ;
if ( FAILED ( hr ) ) {
fwprintf ( stderr , L " Could not speak: %d %s \n " , hr , getErrorString ( hr ) ) ;
return 1 ;
}
voiceToken . Release ( ) ;
outputStream - > Close ( ) ;
if ( voiceId & & voiceId [ 0 ] )
voiceToken . Release ( ) ;
return 0 ;
}
int wmain ( int argc , WCHAR * argv [ ] ) {
int wmain ( int argc , WCHAR * argv [ ] ) {
// https://stackoverflow.com/questions/2492077/output-unicode-strings-in-windows-console-app
( void ) _setmode ( _fileno ( stdout ) , _O_U8TEXT ) ;
const struct option long_options [ ] = {
{ L " help " , no_argument , 0 , L ' h ' } ,
{ L " list " , no_argument , 0 , L ' l ' } ,
{ L " output " , required_argument , 0 , L ' o ' } ,
{ L " voice " , required_argument , 0 , L ' v ' } ,
{ L " type " , required_argument , 0 , L ' t ' } ,
{ L " rate " , required_argument , 0 , L ' r ' } ,
{ L " volume " , required_argument , 0 , L ' V ' } ,
{ L " all-events " , no_argument , 0 , L ' a ' } ,
{ L " start-input-stream-event " , no_argument , 0 , L ' S ' } ,
{ L " end-input-stream-event " , no_argument , 0 , L ' E ' } ,
{ L " voice-change-event " , no_argument , 0 , L ' C ' } ,
{ L " bookmark-event " , no_argument , 0 , L ' B ' } ,
{ L " word-boundary-event " , no_argument , 0 , L ' W ' } ,
{ L " phoneme-event " , no_argument , 0 , L ' F ' } ,
{ L " sentence-boundary-event " , no_argument , 0 , L ' N ' } ,
{ L " viseme-event " , no_argument , 0 , L ' I ' } ,
{ L " audio-level-event " , no_argument , 0 , L ' L ' } ,
{ 0 , 0 , 0 , 0 } ,
{ L " help " , no_argument , 0 , L ' h ' } ,
{ L " list " , no_argument , 0 , L ' l ' } ,
{ L " output " , required_argument , 0 , L ' o ' } ,
{ L " out-type " , required_argument , 0 , L ' T ' } ,
{ L " voice " , required_argument , 0 , L ' v ' } ,
{ L " type " , required_argument , 0 , L ' t ' } ,
{ L " rate " , required_argument , 0 , L ' r ' } ,
{ L " volume " , required_argument , 0 , L ' V ' } ,
{ L " sample-rate " , required_argument , 0 , L ' s ' } ,
{ L " bits " , required_argument , 0 , L ' b ' } ,
{ L " channels " , required_argument , 0 , L ' c ' } ,
{ L " events " , required_argument , 0 , L ' e ' } ,
{ 0 , 0 , 0 , 0 } ,
} ;
int help = 0 ;
@ -337,16 +658,19 @@ int wmain(int argc, WCHAR* argv[]) {
DWORD speakFlags = 0 ;
int rate = 0 ;
int volume = 100 ;
DWORD samplesPerSec = 22050 ;
WORD bitsPerSample = 16 , nChannels = 1 ;
ULONGLONG ullEventInterest = 0 ;
DWORD outType = 0 ;
int option ;
int option_index = 0 ;
while ( 1 ) {
option = getoptW_long ( argc , argv , L " hlo: v:t:r:VaSECBWFNIL " , long_options , & option_index ) ;
option = getoptW_long ( argc , argv , L " hlo: T:v:t:r:Vs:b:c:e: " , long_options , & option_index ) ;
if ( option = = - 1 ) break ;
switch ( option ) {
switch ( option ) {
case L ' h ' :
help = 1 ;
break ;
@ -356,6 +680,18 @@ int wmain(int argc, WCHAR* argv[]) {
case L ' o ' :
wavFilename = optarg ;
break ;
case L ' T ' :
if ( ! _wcsicmp ( optarg , L " auto " ) )
outType = 0 ;
else if ( ! _wcsicmp ( optarg , L " raw " ) )
outType = 1 ;
else if ( ! _wcsicmp ( optarg , L " wav " ) )
outType = 2 ;
else if ( ! _wcsicmp ( optarg , L " ogg " ) )
outType = 3 ;
else
help = 1 ;
break ;
case L ' v ' :
voice = optarg ;
break ;
@ -377,35 +713,20 @@ int wmain(int argc, WCHAR* argv[]) {
case L ' V ' :
volume = wcstol ( optarg , 0 , 10 ) ;
break ;
case L ' a ' :
ullEventInterest = SPFEI_ALL_EVENTS ;
case L ' s ' :
samplesPerSec = wcstol ( optarg , 0 , 10 ) ;
break ;
case L ' S ' :
ullEventInterest | = SPEI_START_INPUT_STREAM ;
case L ' b ' :
bitsPerSample = ( WORD ) wcstol ( optarg , 0 , 10 ) ;
break ;
case L ' E ' :
ullEventInterest | = SPEI_END_INPUT_STREAM ;
case L ' c ' :
nChannels = ( WORD ) wcstol ( optarg , 0 , 10 ) ;
break ;
case L ' C ' :
ullEventInterest | = SPEI_VOICE_CHANGE ;
break ;
case L ' B ' :
ullEventInterest | = SPEI_TTS_BOOKMARK ;
break ;
case L ' W ' :
ullEventInterest | = SPEI_WORD_BOUNDARY ;
break ;
case L ' F ' :
ullEventInterest | = SPEI_PHONEME ;
break ;
case L ' N ' :
ullEventInterest | = SPEI_SENTENCE_BOUNDARY ;
break ;
case L ' I ' :
ullEventInterest | = SPEI_VISEME ;
break ;
case L ' L ' :
ullEventInterest | = SPEI_TTS_AUDIO_LEVEL ;
case L ' e ' :
if ( ! _wcsicmp ( optarg , L " all " ) )
ullEventInterest = 0xfffe ;
else
ullEventInterest = wcstol ( optarg , 0 , 0 ) ;
break ;
}
}
@ -419,21 +740,34 @@ int wmain(int argc, WCHAR* argv[]) {
L " Usage: %s --list | [options] <text> \n "
L " -h, --help Print this help. \n "
L " -l, --list List all voices. \n "
L " -o, --output=FILE Output WAV file. \n "
L " -o, --output=FILE Output file. Default is `output.wav` \n "
L " Use `-' for stdout. \n "
L " -T, --out-type=TYPE Output file type. Default is `auto' \n "
L " `wav' for RIFF .wav \n "
L " `ogg' for Ogg Vorbis \n "
L " `raw' for raw PCM samples \n "
L " `auto' to autodetect from file extension \n "
L " -v, --voice=VOICE Select voice. \n "
L " -t, --type=TYPE Input text type (PLAIN,SSML,SAPI,AUTO). \n "
L " -r, --rate=RATE Rate (speed) of speech, from -10 to 10. \n "
L " -V, --volume=VOL Volume of speech, from 0 to 100. \n "
L " -a, --all-events Log all events in the EVNT RIFF chunk. \n "
L " -S, --start-input-stream-event Log start input stream events. \n "
L " -E, --end-input-stream-event Log end input stream events. \n "
L " -C, --voice-change-event Log voice change events. \n "
L " -B, --bookmark-event Log bookmark events. \n "
L " -W, --word-boundary-event Log word boundary events. \n "
L " -F, --phoneme-event Log phoneme events. \n "
L " -N, --sentence-boundary-event Log sentence boundary events. \n "
L " -I, --viseme-event Log viseme events. \n "
L " -L, --audio-level-event Log audio level events. \n " ,
L " By default, events are logged into the \n "
L " output stream if it is a .wav or an .ogg \n "
L " -s, --sample-rate=HZ Sample rate of output. Default 22050. \n "
L " -b, --bits=BITS Bit depth of output. Default 16. \n "
L " -c, --channels=CHANNELS Number of audio channels of output. Default 1. \n "
L " -e, --events=MASK Select events that are output. \n "
L " Possible values: \n "
L " Stream start 2 \n "
L " Stream end 4 \n "
L " Voice change 8 \n "
L " Bookmark 16 \n "
L " Word boundary 32 \n "
L " Phoneme 64 \n "
L " Sentence boundary 128 \n "
L " Viseme 256 \n "
L " Audio level 512 \n "
L " All TTS events 65534 or `all' \n " ,
argv [ 0 ]
) ;
return 1 ;
@ -449,7 +783,7 @@ int wmain(int argc, WCHAR* argv[]) {
if ( list ) {
ret = listVoices ( ) ;
} else {
ret = speakToWav ( argv [ optind ] , voice , wavFilename , rate, volume , speakFlag s, ullEventInterest ) ;
ret = speakToWav ( argv [ optind ] , voice , wavFilename , outType, rate, volume , speakFlag s, samplesPerSec , bitsPerSample , nChannel s, ullEventInterest ) ;
}
: : CoUninitialize ( ) ;