wine/dlls/sapi/tts.c

/*
 * Speech API (SAPI) text-to-speech implementation.
 *
 * Copyright 2019 Jactry Zeng for CodeWeavers
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include <stdarg.h>

#define COBJMACROS

#include "windef.h"
#include "winbase.h"
#include "objbase.h"

#include "sapiddk.h"
#include "sperror.h"

#include "wine/debug.h"

#include "sapi_private.h"

WINE_DEFAULT_DEBUG_CHANNEL(sapi);

struct speech_voice
{
    ISpeechVoice ISpeechVoice_iface;
    ISpVoice ISpVoice_iface;
    IConnectionPointContainer IConnectionPointContainer_iface;
    LONG ref;

    ISpStreamFormat *output;
    ISpTTSEngine *engine;
    LONG cur_stream_num;
    DWORD actions;
    USHORT volume;
    LONG rate;
    struct async_queue queue;
    CRITICAL_SECTION cs;
};

static inline struct speech_voice *impl_from_ISpeechVoice(ISpeechVoice *iface)
{
    return CONTAINING_RECORD(iface, struct speech_voice, ISpeechVoice_iface);
}

static inline struct speech_voice *impl_from_ISpVoice(ISpVoice *iface)
{
    return CONTAINING_RECORD(iface, struct speech_voice, ISpVoice_iface);
}

static inline struct speech_voice *impl_from_IConnectionPointContainer(IConnectionPointContainer *iface)
{
    return CONTAINING_RECORD(iface, struct speech_voice, IConnectionPointContainer_iface);
}

struct tts_engine_site
{
    ISpTTSEngineSite ISpTTSEngineSite_iface;
    LONG ref;

    struct speech_voice *voice;
    ULONG stream_num;
};

static inline struct tts_engine_site *impl_from_ISpTTSEngineSite(ISpTTSEngineSite *iface)
{
    return CONTAINING_RECORD(iface, struct tts_engine_site, ISpTTSEngineSite_iface);
}

static HRESULT create_default_token(const WCHAR *cat_id, ISpObjectToken **token)
{
    ISpObjectTokenCategory *cat;
    WCHAR *default_token_id = NULL;
    HRESULT hr;

    TRACE("(%s, %p).\n", debugstr_w(cat_id), token);

    if (FAILED(hr = CoCreateInstance(&CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER,
                                     &IID_ISpObjectTokenCategory, (void **)&cat)))
        return hr;

    if (FAILED(hr = ISpObjectTokenCategory_SetId(cat, cat_id, FALSE)) ||
        FAILED(hr = ISpObjectTokenCategory_GetDefaultTokenId(cat, &default_token_id)))
    {
        ISpObjectTokenCategory_Release(cat);
        return hr;
    }
    ISpObjectTokenCategory_Release(cat);

    if (FAILED(hr = CoCreateInstance(&CLSID_SpObjectToken, NULL, CLSCTX_INPROC_SERVER,
                                     &IID_ISpObjectToken, (void **)token)))
        goto done;

    if (FAILED(hr = ISpObjectToken_SetId(*token, NULL, default_token_id, FALSE)))
    {
        ISpObjectToken_Release(*token);
        *token = NULL;
    }

done:
    CoTaskMemFree(default_token_id);
    return hr;
}

/* ISpeechVoice interface */
static HRESULT WINAPI speech_voice_QueryInterface(ISpeechVoice *iface, REFIID iid, void **obj)
{
    struct speech_voice *This = impl_from_ISpeechVoice(iface);

    TRACE("(%p, %s %p).\n", iface, debugstr_guid(iid), obj);

    if (IsEqualIID(iid, &IID_IUnknown) ||
        IsEqualIID(iid, &IID_IDispatch) ||
        IsEqualIID(iid, &IID_ISpeechVoice))
        *obj = &This->ISpeechVoice_iface;
    else if (IsEqualIID(iid, &IID_ISpVoice))
        *obj = &This->ISpVoice_iface;
    else if (IsEqualIID(iid, &IID_IConnectionPointContainer))
        *obj = &This->IConnectionPointContainer_iface;
    else
    {
        *obj = NULL;
        FIXME("interface %s not implemented.\n", debugstr_guid(iid));
        return E_NOINTERFACE;
    }

    IUnknown_AddRef((IUnknown *)*obj);
    return S_OK;
}

static ULONG WINAPI speech_voice_AddRef(ISpeechVoice *iface)
{
    struct speech_voice *This = impl_from_ISpeechVoice(iface);
    ULONG ref = InterlockedIncrement(&This->ref);

    TRACE("(%p): ref=%lu.\n", iface, ref);

    return ref;
}

static ULONG WINAPI speech_voice_Release(ISpeechVoice *iface)
{
    struct speech_voice *This = impl_from_ISpeechVoice(iface);
    ULONG ref = InterlockedDecrement(&This->ref);

    TRACE("(%p): ref=%lu.\n", iface, ref);

    if (!ref)
    {
        async_cancel_queue(&This->queue);
        if (This->output) ISpStreamFormat_Release(This->output);
        if (This->engine) ISpTTSEngine_Release(This->engine);
        DeleteCriticalSection(&This->cs);

        heap_free(This);
    }

    return ref;
}

static HRESULT WINAPI speech_voice_GetTypeInfoCount(ISpeechVoice *iface, UINT *info)
{
    FIXME("(%p, %p): stub.\n", iface, info);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_GetTypeInfo(ISpeechVoice *iface, UINT info, LCID lcid,
                                               ITypeInfo **type_info)
{
    FIXME("(%p, %u, %lu, %p): stub.\n", iface, info, lcid, type_info);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_GetIDsOfNames(ISpeechVoice *iface, REFIID riid, LPOLESTR *names,
                                                 UINT count, LCID lcid, DISPID *dispid)
{
    FIXME("(%p, %s, %p, %u, %lu, %p): stub.\n", iface, debugstr_guid(riid), names, count, lcid, dispid);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_Invoke(ISpeechVoice *iface, DISPID dispid, REFIID riid, LCID lcid,
                                          WORD flags, DISPPARAMS *params, VARIANT *result,
                                          EXCEPINFO *excepinfo, UINT *argerr)
{
    FIXME("(%p, %ld, %s, %#lx, %#x, %p, %p, %p, %p): stub.\n", iface, dispid, debugstr_guid(riid),
          lcid, flags, params, result, excepinfo, argerr);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_Status(ISpeechVoice *iface, ISpeechVoiceStatus **status)
{
    FIXME("(%p, %p): stub.\n", iface, status);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_Voice(ISpeechVoice *iface, ISpeechObjectToken **voice)
{
    FIXME("(%p, %p): stub.\n", iface, voice);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_putref_Voice(ISpeechVoice *iface, ISpeechObjectToken *voice)
{
    FIXME("(%p, %p): stub.\n", iface, voice);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_AudioOutput(ISpeechVoice *iface, ISpeechObjectToken **output)
{
    FIXME("(%p, %p): stub.\n", iface, output);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_putref_AudioOutput(ISpeechVoice *iface, ISpeechObjectToken *output)
{
    FIXME("(%p, %p): stub.\n", iface, output);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_AudioOutputStream(ISpeechVoice *iface, ISpeechBaseStream **output)
{
    FIXME("(%p, %p): stub.\n", iface, output);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_putref_AudioOutputStream(ISpeechVoice *iface, ISpeechBaseStream *output)
{
    FIXME("(%p, %p): stub.\n", iface, output);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_Rate(ISpeechVoice *iface, LONG *rate)
{
    FIXME("(%p, %p): stub.\n", iface, rate);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_Rate(ISpeechVoice *iface, LONG rate)
{
    FIXME("(%p, %ld): stub.\n", iface, rate);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_Volume(ISpeechVoice *iface, LONG *volume)
{
    FIXME("(%p, %p): stub.\n", iface, volume);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_Volume(ISpeechVoice *iface, LONG volume)
{
    FIXME("(%p, %ld): stub.\n", iface, volume);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_AllowAudioOutputFormatChangesOnNextSet(ISpeechVoice *iface,
                                                                              VARIANT_BOOL allow)
{
    FIXME("(%p, %d): stub.\n", iface, allow);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_AllowAudioOutputFormatChangesOnNextSet(ISpeechVoice *iface, VARIANT_BOOL *allow)
{
    FIXME("(%p, %p): stub.\n", iface, allow);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_EventInterests(ISpeechVoice *iface, SpeechVoiceEvents *flags)
{
    FIXME("(%p, %p): stub.\n", iface, flags);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_EventInterests(ISpeechVoice *iface, SpeechVoiceEvents flags)
{
    FIXME("(%p, %#x): stub.\n", iface, flags);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_Priority(ISpeechVoice *iface, SpeechVoicePriority priority)
{
    FIXME("(%p, %d): stub.\n", iface, priority);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_Priority(ISpeechVoice *iface, SpeechVoicePriority *priority)
{
    FIXME("(%p, %p): stub.\n", iface, priority);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_AlertBoundary(ISpeechVoice *iface, SpeechVoiceEvents boundary)
{
    FIXME("(%p, %#x): stub.\n", iface, boundary);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_AlertBoundary(ISpeechVoice *iface, SpeechVoiceEvents *boundary)
{
    FIXME("(%p, %p): stub.\n", iface, boundary);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_put_SynchronousSpeakTimeout(ISpeechVoice *iface, LONG timeout)
{
    FIXME("(%p, %ld): stub.\n", iface, timeout);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_get_SynchronousSpeakTimeout(ISpeechVoice *iface, LONG *timeout)
{
    FIXME("(%p, %p): stub.\n", iface, timeout);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_Speak(ISpeechVoice *iface, BSTR text, SpeechVoiceSpeakFlags flags, LONG *number)
{
    FIXME("(%p, %s, %#x, %p): stub.\n", iface, debugstr_w(text), flags, number);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_SpeakStream(ISpeechVoice *iface, ISpeechBaseStream *stream,
                                               SpeechVoiceSpeakFlags flags, LONG *number)
{
    FIXME("(%p, %p, %#x, %p): stub.\n", iface, stream, flags, number);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_Pause(ISpeechVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_Resume(ISpeechVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_Skip(ISpeechVoice *iface, const BSTR type, LONG items, LONG *skipped)
{
    FIXME("(%p, %s, %ld, %p): stub.\n", iface, debugstr_w(type), items, skipped);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_GetVoices(ISpeechVoice *iface, BSTR required, BSTR optional,
                                             ISpeechObjectTokens **tokens)
{
    FIXME("(%p, %s, %s, %p): stub.\n", iface, debugstr_w(required), debugstr_w(optional), tokens);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_GetAudioOutputs(ISpeechVoice *iface, BSTR required, BSTR optional,
                                                   ISpeechObjectTokens **tokens)
{
    FIXME("(%p, %s, %s, %p): stub.\n", iface, debugstr_w(required), debugstr_w(optional), tokens);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_WaitUntilDone(ISpeechVoice *iface, LONG timeout, VARIANT_BOOL *done)
{
    FIXME("(%p, %ld, %p): stub.\n", iface, timeout, done);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_SpeakCompleteEvent(ISpeechVoice *iface, LONG *handle)
{
    FIXME("(%p, %p): stub.\n", iface, handle);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_IsUISupported(ISpeechVoice *iface, const BSTR typeui, const VARIANT *data,
                                                 VARIANT_BOOL *supported)
{
    FIXME("(%p, %s, %p, %p): stub.\n", iface, debugstr_w(typeui), data, supported);

    return E_NOTIMPL;
}

static HRESULT WINAPI speech_voice_DisplayUI(ISpeechVoice *iface, LONG hwnd, BSTR title,
                                             const BSTR typeui, const VARIANT *data)
{
    FIXME("(%p, %ld, %s, %s, %p): stub.\n", iface, hwnd, debugstr_w(title), debugstr_w(typeui), data);

    return E_NOTIMPL;
}

static const ISpeechVoiceVtbl speech_voice_vtbl =
{
    speech_voice_QueryInterface,
    speech_voice_AddRef,
    speech_voice_Release,
    speech_voice_GetTypeInfoCount,
    speech_voice_GetTypeInfo,
    speech_voice_GetIDsOfNames,
    speech_voice_Invoke,
    speech_voice_get_Status,
    speech_voice_get_Voice,
    speech_voice_putref_Voice,
    speech_voice_get_AudioOutput,
    speech_voice_putref_AudioOutput,
    speech_voice_get_AudioOutputStream,
    speech_voice_putref_AudioOutputStream,
    speech_voice_get_Rate,
    speech_voice_put_Rate,
    speech_voice_get_Volume,
    speech_voice_put_Volume,
    speech_voice_put_AllowAudioOutputFormatChangesOnNextSet,
    speech_voice_get_AllowAudioOutputFormatChangesOnNextSet,
    speech_voice_get_EventInterests,
    speech_voice_put_EventInterests,
    speech_voice_put_Priority,
    speech_voice_get_Priority,
    speech_voice_put_AlertBoundary,
    speech_voice_get_AlertBoundary,
    speech_voice_put_SynchronousSpeakTimeout,
    speech_voice_get_SynchronousSpeakTimeout,
    speech_voice_Speak,
    speech_voice_SpeakStream,
    speech_voice_Pause,
    speech_voice_Resume,
    speech_voice_Skip,
    speech_voice_GetVoices,
    speech_voice_GetAudioOutputs,
    speech_voice_WaitUntilDone,
    speech_voice_SpeakCompleteEvent,
    speech_voice_IsUISupported,
    speech_voice_DisplayUI,
};

/* ISpVoice interface */
static HRESULT WINAPI spvoice_QueryInterface(ISpVoice *iface, REFIID iid, void **obj)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p, %s %p).\n", iface, debugstr_guid(iid), obj);

    return ISpeechVoice_QueryInterface(&This->ISpeechVoice_iface, iid, obj);
}

static ULONG WINAPI spvoice_AddRef(ISpVoice *iface)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p).\n", iface);

    return ISpeechVoice_AddRef(&This->ISpeechVoice_iface);
}

static ULONG WINAPI spvoice_Release(ISpVoice *iface)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p).\n", iface);

    return ISpeechVoice_Release(&This->ISpeechVoice_iface);
}

static HRESULT WINAPI spvoice_SetNotifySink(ISpVoice *iface, ISpNotifySink *sink)
{
    FIXME("(%p, %p): stub.\n", iface, sink);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetNotifyWindowMessage(ISpVoice *iface, HWND hwnd, UINT msg,
                                                     WPARAM wparam, LPARAM lparam)
{
    FIXME("(%p, %p, %u, %Ix, %Ix): stub.\n", iface, hwnd, msg, wparam, lparam);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetNotifyCallbackFunction(ISpVoice *iface, SPNOTIFYCALLBACK *callback,
                                                        WPARAM wparam, LPARAM lparam)
{
    FIXME("(%p, %p, %Ix, %Ix): stub.\n", iface, callback, wparam, lparam);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetNotifyCallbackInterface(ISpVoice *iface, ISpNotifyCallback *callback,
                                                         WPARAM wparam, LPARAM lparam)
{
    FIXME("(%p, %p, %Ix, %Ix): stub.\n", iface, callback, wparam, lparam);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetNotifyWin32Event(ISpVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_WaitForNotifyEvent(ISpVoice *iface, DWORD milliseconds)
{
    FIXME("(%p, %ld): stub.\n", iface, milliseconds);

    return E_NOTIMPL;
}

static HANDLE WINAPI spvoice_GetNotifyEventHandle(ISpVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return NULL;
}

static HRESULT WINAPI spvoice_SetInterest(ISpVoice *iface, ULONGLONG event, ULONGLONG queued)
{
    FIXME("(%p, %s, %s): stub.\n", iface, wine_dbgstr_longlong(event), wine_dbgstr_longlong(queued));

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetEvents(ISpVoice *iface, ULONG count, SPEVENT *array, ULONG *fetched)
{
    FIXME("(%p, %lu, %p, %p): stub.\n", iface, count, array, fetched);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetInfo(ISpVoice *iface, SPEVENTSOURCEINFO *info)
{
    FIXME("(%p, %p): stub.\n", iface, info);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetOutput(ISpVoice *iface, IUnknown *unk, BOOL allow_format_changes)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);
    ISpStreamFormat *stream = NULL;
    ISpObjectToken *token = NULL;
    HRESULT hr;

    TRACE("(%p, %p, %d).\n", iface, unk, allow_format_changes);

    if (!allow_format_changes)
        FIXME("ignoring allow_format_changes = FALSE.\n");

    if (FAILED(hr = async_start_queue(&This->queue)))
        return hr;

    if (!unk)
    {
        /* TODO: Create the default SpAudioOut token here once SpMMAudioEnum is implemented. */
        if (FAILED(hr = CoCreateInstance(&CLSID_SpMMAudioOut, NULL, CLSCTX_INPROC_SERVER,
                                         &IID_ISpStreamFormat, (void **)&stream)))
            return hr;
    }
    else
    {
        if (FAILED(IUnknown_QueryInterface(unk, &IID_ISpStreamFormat, (void **)&stream)))
        {
            if (FAILED(IUnknown_QueryInterface(unk, &IID_ISpObjectToken, (void **)&token)))
                return E_INVALIDARG;
        }
    }

    if (!stream)
    {
        hr = ISpObjectToken_CreateInstance(token, NULL, CLSCTX_ALL, &IID_ISpStreamFormat, (void **)&stream);
        ISpObjectToken_Release(token);
        if (FAILED(hr))
            return hr;
    }

    EnterCriticalSection(&This->cs);

    if (This->output)
        ISpStreamFormat_Release(This->output);
    This->output = stream;

    LeaveCriticalSection(&This->cs);

    return S_OK;
}

static HRESULT WINAPI spvoice_GetOutputObjectToken(ISpVoice *iface, ISpObjectToken **token)
{
    FIXME("(%p, %p): stub.\n", iface, token);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetOutputStream(ISpVoice *iface, ISpStreamFormat **stream)
{
    FIXME("(%p, %p): stub.\n", iface, stream);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_Pause(ISpVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_Resume(ISpVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetVoice(ISpVoice *iface, ISpObjectToken *token)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);
    ISpTTSEngine *engine;
    HRESULT hr;

    TRACE("(%p, %p).\n", iface, token);

    if (!token)
    {
        if (FAILED(hr = create_default_token(SPCAT_VOICES, &token)))
            return hr;
    }
    else
        ISpObjectToken_AddRef(token);

    hr = ISpObjectToken_CreateInstance(token, NULL, CLSCTX_ALL, &IID_ISpTTSEngine, (void **)&engine);
    ISpObjectToken_Release(token);
    if (FAILED(hr))
        return hr;

    EnterCriticalSection(&This->cs);

    if (This->engine)
        ISpTTSEngine_Release(This->engine);
    This->engine = engine;

    LeaveCriticalSection(&This->cs);

    return S_OK;
}

static HRESULT WINAPI spvoice_GetVoice(ISpVoice *iface, ISpObjectToken **token)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);
    ISpObjectWithToken *engine_token_iface;
    HRESULT hr;

    TRACE("(%p, %p).\n", iface, token);

    if (!token)
        return E_POINTER;

    EnterCriticalSection(&This->cs);

    if (!This->engine)
    {
        LeaveCriticalSection(&This->cs);
        return create_default_token(SPCAT_VOICES, token);
    }

    if (SUCCEEDED(hr = ISpTTSEngine_QueryInterface(This->engine, &IID_ISpObjectWithToken, (void **)&engine_token_iface)))
    {
        hr = ISpObjectWithToken_GetObjectToken(engine_token_iface, token);
        ISpObjectWithToken_Release(engine_token_iface);
    }

    LeaveCriticalSection(&This->cs);

    return hr;
}

struct async_result
{
    HANDLE done;
    HRESULT hr;
};

struct speak_task
{
    struct async_task task;
    struct async_result *result;

    struct speech_voice *voice;
    SPVTEXTFRAG *frag_list;
    ISpTTSEngineSite *site;
    DWORD flags;
};

static HRESULT set_output_format(ISpStreamFormat *output, ISpTTSEngine *engine, GUID *fmtid, WAVEFORMATEX **wfx)
{
    GUID output_fmtid;
    WAVEFORMATEX *output_wfx = NULL;
    ISpAudio *audio = NULL;
    HRESULT hr;

    if (FAILED(hr = ISpStreamFormat_GetFormat(output, &output_fmtid, &output_wfx)))
        return hr;
    if (FAILED(hr = ISpTTSEngine_GetOutputFormat(engine, &output_fmtid, output_wfx, fmtid, wfx)))
        goto done;
    if (!IsEqualGUID(fmtid, &SPDFID_WaveFormatEx))
    {
        hr = E_INVALIDARG;
        goto done;
    }

    if (memcmp(output_wfx, *wfx, sizeof(WAVEFORMATEX)) ||
        memcmp(output_wfx + 1, *wfx + 1, output_wfx->cbSize))
    {
        if (FAILED(hr = ISpStreamFormat_QueryInterface(output, &IID_ISpAudio, (void **)&audio)) ||
            FAILED(hr = ISpAudio_SetFormat(audio, &SPDFID_WaveFormatEx, *wfx)))
            goto done;
    }

done:
    CoTaskMemFree(output_wfx);
    if (audio) ISpAudio_Release(audio);
    return hr;
}

static void speak_proc(struct async_task *task)
{
    struct speak_task *speak_task = (struct speak_task *)task;
    struct speech_voice *This = speak_task->voice;
    GUID fmtid;
    WAVEFORMATEX *wfx = NULL;
    ISpTTSEngine *engine = NULL;
    ISpAudio *audio = NULL;
    HRESULT hr;

    TRACE("(%p).\n", task);

    EnterCriticalSection(&This->cs);

    if (This->actions & SPVES_ABORT)
    {
        LeaveCriticalSection(&This->cs);
        hr = S_OK;
        goto done;
    }

    if (FAILED(hr = set_output_format(This->output, This->engine, &fmtid, &wfx)))
    {
        LeaveCriticalSection(&This->cs);
        ERR("failed setting output format: %#lx.\n", hr);
        goto done;
    }
    engine = This->engine;
    ISpTTSEngine_AddRef(engine);

    if (SUCCEEDED(ISpStreamFormat_QueryInterface(This->output, &IID_ISpAudio, (void **)&audio)))
        ISpAudio_SetState(audio, SPAS_RUN, 0);

    This->actions = SPVES_RATE | SPVES_VOLUME;

    LeaveCriticalSection(&This->cs);

    hr = ISpTTSEngine_Speak(engine, speak_task->flags, &fmtid, wfx, speak_task->frag_list, speak_task->site);
    if (SUCCEEDED(hr))
    {
        ISpStreamFormat_Commit(This->output, STGC_DEFAULT);
        if (audio)
            WaitForSingleObject(ISpAudio_EventHandle(audio), INFINITE);
    }
    else
        WARN("ISpTTSEngine_Speak failed: %#lx.\n", hr);

done:
    if (audio)
    {
        ISpAudio_SetState(audio, SPAS_CLOSED, 0);
        ISpAudio_Release(audio);
    }
    CoTaskMemFree(wfx);
    if (engine) ISpTTSEngine_Release(engine);
    heap_free(speak_task->frag_list);
    ISpTTSEngineSite_Release(speak_task->site);

    if (speak_task->result)
    {
        speak_task->result->hr = hr;
        SetEvent(speak_task->result->done);
    }
}

static HRESULT ttsenginesite_create(struct speech_voice *voice, ULONG stream_num, ISpTTSEngineSite **site);

static HRESULT WINAPI spvoice_Speak(ISpVoice *iface, const WCHAR *contents, DWORD flags, ULONG *stream_num_out)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);
    ISpTTSEngineSite *site = NULL;
    SPVTEXTFRAG *frag;
    struct speak_task *speak_task = NULL;
    struct async_result *result = NULL;
    size_t contents_len, contents_size;
    ULONG stream_num;
    HRESULT hr;

    TRACE("(%p, %p, %#lx, %p).\n", iface, contents, flags, stream_num_out);

    flags &= ~SPF_IS_NOT_XML;
    if (flags & ~(SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_NLP_SPEAK_PUNC))
    {
        FIXME("flags %#lx not implemented.\n", flags & ~(SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_NLP_SPEAK_PUNC));
        return E_NOTIMPL;
    }

    if (flags & SPF_PURGEBEFORESPEAK)
    {
        ISpAudio *audio;

        EnterCriticalSection(&This->cs);

        This->actions = SPVES_ABORT;
        if (This->output && SUCCEEDED(ISpStreamFormat_QueryInterface(This->output, &IID_ISpAudio, (void **)&audio)))
        {
            ISpAudio_SetState(audio, SPAS_CLOSED, 0);
            ISpAudio_Release(audio);
        }

        LeaveCriticalSection(&This->cs);

        async_empty_queue(&This->queue);

        EnterCriticalSection(&This->cs);
        This->actions = SPVES_CONTINUE;
        LeaveCriticalSection(&This->cs);

        if (!contents || !*contents)
            return S_OK;
    }
    else if (!contents)
        return E_POINTER;

    contents_len = wcslen(contents);
    contents_size = sizeof(WCHAR) * (contents_len + 1);

    if (!This->output)
    {
        /* Create a new output stream with the default output. */
        if (FAILED(hr = ISpVoice_SetOutput(iface, NULL, TRUE)))
            return hr;
    }

    if (!This->engine)
    {
        /* Create a new engine with the default voice. */
        if (FAILED(hr = ISpVoice_SetVoice(iface, NULL)))
            return hr;
    }

    if (!(frag = heap_alloc(sizeof(*frag) + contents_size)))
        return E_OUTOFMEMORY;
    memset(frag, 0, sizeof(*frag));
    memcpy(frag + 1, contents, contents_size);
    frag->State.eAction = SPVA_Speak;
    frag->State.Volume  = 100;
    frag->pTextStart    = (WCHAR *)(frag + 1);
    frag->ulTextLen     = contents_len;
    frag->ulTextSrcOffset = 0;

    stream_num = InterlockedIncrement(&This->cur_stream_num);
    if (FAILED(hr = ttsenginesite_create(This, stream_num, &site)))
    {
        FIXME("Failed to create ttsenginesite: %#lx.\n", hr);
        goto fail;
    }

    speak_task = heap_alloc(sizeof(*speak_task));

    speak_task->task.proc = speak_proc;
    speak_task->result    = NULL;
    speak_task->voice     = This;
    speak_task->frag_list = frag;
    speak_task->site      = site;
    speak_task->flags     = flags & SPF_NLP_SPEAK_PUNC;

    if (!(flags & SPF_ASYNC))
    {
        if (!(result = heap_alloc(sizeof(*result))))
        {
            hr = E_OUTOFMEMORY;
            goto fail;
        }
        result->hr = E_FAIL;
        result->done = CreateEventW(NULL, FALSE, FALSE, NULL);
        speak_task->result = result;
    }

    if (FAILED(hr = async_queue_task(&This->queue, (struct async_task *)speak_task)))
    {
        WARN("Failed to queue task: %#lx.\n", hr);
        goto fail;
    }

    if (stream_num_out)
        *stream_num_out = stream_num;

    if (flags & SPF_ASYNC)
        return S_OK;
    else
    {
        WaitForSingleObject(result->done, INFINITE);
        hr = result->hr;
        CloseHandle(result->done);
        heap_free(result);
        return hr;
    }

fail:
    if (site) ISpTTSEngineSite_Release(site);
    heap_free(frag);
    heap_free(speak_task);
    if (result)
    {
        CloseHandle(result->done);
        heap_free(result);
    }
    return hr;
}

static HRESULT WINAPI spvoice_SpeakStream(ISpVoice *iface, IStream *stream, DWORD flags, ULONG *number)
{
    FIXME("(%p, %p, %#lx, %p): stub.\n", iface, stream, flags, number);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetStatus(ISpVoice *iface, SPVOICESTATUS *status, WCHAR **bookmark)
{
    FIXME("(%p, %p, %p): stub.\n", iface, status, bookmark);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_Skip(ISpVoice *iface, const WCHAR *type, LONG items, ULONG *skipped)
{
    FIXME("(%p, %s, %ld, %p): stub.\n", iface, debugstr_w(type), items, skipped);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetPriority(ISpVoice *iface, SPVPRIORITY priority)
{
    FIXME("(%p, %d): stub.\n", iface, priority);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetPriority(ISpVoice *iface, SPVPRIORITY *priority)
{
    FIXME("(%p, %p): stub.\n", iface, priority);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetAlertBoundary(ISpVoice *iface, SPEVENTENUM boundary)
{
    FIXME("(%p, %d): stub.\n", iface, boundary);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetAlertBoundary(ISpVoice *iface, SPEVENTENUM *boundary)
{
    FIXME("(%p, %p): stub.\n", iface, boundary);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_SetRate(ISpVoice *iface, LONG rate)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p, %ld).\n", iface, rate);

    EnterCriticalSection(&This->cs);
    This->rate = rate;
    This->actions |= SPVES_RATE;
    LeaveCriticalSection(&This->cs);

    return S_OK;
}

static HRESULT WINAPI spvoice_GetRate(ISpVoice *iface, LONG *rate)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p, %p).\n", iface, rate);

    EnterCriticalSection(&This->cs);
    *rate = This->rate;
    LeaveCriticalSection(&This->cs);

    return S_OK;
}

static HRESULT WINAPI spvoice_SetVolume(ISpVoice *iface, USHORT volume)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p, %d).\n", iface, volume);

    if (volume > 100)
        return E_INVALIDARG;

    EnterCriticalSection(&This->cs);
    This->volume = volume;
    This->actions |= SPVES_VOLUME;
    LeaveCriticalSection(&This->cs);

    return S_OK;
}

static HRESULT WINAPI spvoice_GetVolume(ISpVoice *iface, USHORT *volume)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);

    TRACE("(%p, %p).\n", iface, volume);

    EnterCriticalSection(&This->cs);
    *volume = This->volume;
    LeaveCriticalSection(&This->cs);

    return S_OK;
}

static HRESULT WINAPI spvoice_WaitUntilDone(ISpVoice *iface, ULONG timeout)
{
    struct speech_voice *This = impl_from_ISpVoice(iface);
    HRESULT hr;

    TRACE("(%p, %ld).\n", iface, timeout);

    hr = async_wait_queue_empty(&This->queue, timeout);

    if (hr == WAIT_OBJECT_0) return S_OK;
    else if (hr == WAIT_TIMEOUT) return S_FALSE;
    return hr;
}

static HRESULT WINAPI spvoice_SetSyncSpeakTimeout(ISpVoice *iface, ULONG timeout)
{
    FIXME("(%p, %ld): stub.\n", iface, timeout);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_GetSyncSpeakTimeout(ISpVoice *iface, ULONG *timeout)
{
    FIXME("(%p, %p): stub.\n", iface, timeout);

    return E_NOTIMPL;
}

static HANDLE WINAPI spvoice_SpeakCompleteEvent(ISpVoice *iface)
{
    FIXME("(%p): stub.\n", iface);

    return NULL;
}

static HRESULT WINAPI spvoice_IsUISupported(ISpVoice *iface, const WCHAR *type, void *extra,
                                            ULONG count, BOOL *supported)
{
    FIXME("(%p, %p, %p, %ld, %p): stub.\n", iface, type, extra, count, supported);

    return E_NOTIMPL;
}

static HRESULT WINAPI spvoice_DisplayUI(ISpVoice *iface, HWND parent, const WCHAR *title,
                                        const WCHAR *type, void *extra, ULONG count)
{
    FIXME("(%p, %p, %p, %p, %p, %ld): stub.\n", iface, parent, title, type, extra, count);

    return E_NOTIMPL;
}

static const ISpVoiceVtbl spvoice_vtbl =
{
    spvoice_QueryInterface,
    spvoice_AddRef,
    spvoice_Release,
    spvoice_SetNotifySink,
    spvoice_SetNotifyWindowMessage,
    spvoice_SetNotifyCallbackFunction,
    spvoice_SetNotifyCallbackInterface,
    spvoice_SetNotifyWin32Event,
    spvoice_WaitForNotifyEvent,
    spvoice_GetNotifyEventHandle,
    spvoice_SetInterest,
    spvoice_GetEvents,
    spvoice_GetInfo,
    spvoice_SetOutput,
    spvoice_GetOutputObjectToken,
    spvoice_GetOutputStream,
    spvoice_Pause,
    spvoice_Resume,
    spvoice_SetVoice,
    spvoice_GetVoice,
    spvoice_Speak,
    spvoice_SpeakStream,
    spvoice_GetStatus,
    spvoice_Skip,
    spvoice_SetPriority,
    spvoice_GetPriority,
    spvoice_SetAlertBoundary,
    spvoice_GetAlertBoundary,
    spvoice_SetRate,
    spvoice_GetRate,
    spvoice_SetVolume,
    spvoice_GetVolume,
    spvoice_WaitUntilDone,
    spvoice_SetSyncSpeakTimeout,
    spvoice_GetSyncSpeakTimeout,
    spvoice_SpeakCompleteEvent,
    spvoice_IsUISupported,
    spvoice_DisplayUI
};

/* ISpTTSEngineSite interface */
static HRESULT WINAPI ttsenginesite_QueryInterface(ISpTTSEngineSite *iface, REFIID iid, void **obj)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);

    TRACE("(%p, %s %p).\n", iface, debugstr_guid(iid), obj);

    if (IsEqualIID(iid, &IID_IUnknown) ||
        IsEqualIID(iid, &IID_ISpTTSEngineSite))
        *obj = &This->ISpTTSEngineSite_iface;
    else
    {
        *obj = NULL;
        FIXME("interface %s not implemented.\n", debugstr_guid(iid));
        return E_NOINTERFACE;
    }

    IUnknown_AddRef((IUnknown *)*obj);
    return S_OK;
}

static ULONG WINAPI ttsenginesite_AddRef(ISpTTSEngineSite *iface)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);
    ULONG ref = InterlockedIncrement(&This->ref);

    TRACE("(%p): ref=%lu.\n", iface, ref);

    return ref;
}

static ULONG WINAPI ttsenginesite_Release(ISpTTSEngineSite *iface)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);

    ULONG ref = InterlockedDecrement(&This->ref);

    TRACE("(%p): ref=%lu.\n", iface, ref);

    if (!ref)
    {
        if (This->voice)
            ISpeechVoice_Release(&This->voice->ISpeechVoice_iface);
        heap_free(This);
    }

    return ref;
}

static HRESULT WINAPI ttsenginesite_AddEvents(ISpTTSEngineSite *iface, const SPEVENT *events, ULONG count)
{
    FIXME("(%p, %p, %ld): stub.\n", iface, events, count);

    return S_OK;
}

static HRESULT WINAPI ttsenginesite_GetEventInterest(ISpTTSEngineSite *iface, ULONGLONG *interest)
{
    FIXME("(%p, %p): stub.\n", iface, interest);

    return E_NOTIMPL;
}

static DWORD WINAPI ttsenginesite_GetActions(ISpTTSEngineSite *iface)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);
    DWORD actions;

    TRACE("(%p).\n", iface);

    EnterCriticalSection(&This->voice->cs);
    actions = This->voice->actions;
    LeaveCriticalSection(&This->voice->cs);

    return actions;
}

static HRESULT WINAPI ttsenginesite_Write(ISpTTSEngineSite *iface, const void *buf, ULONG cb, ULONG *cb_written)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);

    TRACE("(%p, %p, %ld, %p).\n", iface, buf, cb, cb_written);

    if (!This->voice->output)
        return SPERR_UNINITIALIZED;

    return ISpStreamFormat_Write(This->voice->output, buf, cb, cb_written);
}

static HRESULT WINAPI ttsenginesite_GetRate(ISpTTSEngineSite *iface, LONG *rate)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);

    TRACE("(%p, %p).\n", iface, rate);

    EnterCriticalSection(&This->voice->cs);
    *rate = This->voice->rate;
    This->voice->actions &= ~SPVES_RATE;
    LeaveCriticalSection(&This->voice->cs);

    return S_OK;
}

static HRESULT WINAPI ttsenginesite_GetVolume(ISpTTSEngineSite *iface, USHORT *volume)
{
    struct tts_engine_site *This = impl_from_ISpTTSEngineSite(iface);

    TRACE("(%p, %p).\n", iface, volume);

    EnterCriticalSection(&This->voice->cs);
    *volume = This->voice->volume;
    This->voice->actions &= ~SPVES_VOLUME;
    LeaveCriticalSection(&This->voice->cs);

    return S_OK;
}

static HRESULT WINAPI ttsenginesite_GetSkipInfo(ISpTTSEngineSite *iface, SPVSKIPTYPE *type, LONG *skip_count)
{
    FIXME("(%p, %p, %p): stub.\n", iface, type, skip_count);

    return E_NOTIMPL;
}

static HRESULT WINAPI ttsenginesite_CompleteSkip(ISpTTSEngineSite *iface, LONG num_skipped)
{
    FIXME("(%p, %ld): stub.\n", iface, num_skipped);

    return E_NOTIMPL;
}

const static ISpTTSEngineSiteVtbl ttsenginesite_vtbl =
{
    ttsenginesite_QueryInterface,
    ttsenginesite_AddRef,
    ttsenginesite_Release,
    ttsenginesite_AddEvents,
    ttsenginesite_GetEventInterest,
    ttsenginesite_GetActions,
    ttsenginesite_Write,
    ttsenginesite_GetRate,
    ttsenginesite_GetVolume,
    ttsenginesite_GetSkipInfo,
    ttsenginesite_CompleteSkip
};

static HRESULT ttsenginesite_create(struct speech_voice *voice, ULONG stream_num, ISpTTSEngineSite **site)
{
    struct tts_engine_site *This = heap_alloc(sizeof(*This));

    if (!This) return E_OUTOFMEMORY;

    This->ISpTTSEngineSite_iface.lpVtbl = &ttsenginesite_vtbl;

    This->ref = 1;
    This->voice = voice;
    This->stream_num = stream_num;

    ISpeechVoice_AddRef(&This->voice->ISpeechVoice_iface);

    *site = &This->ISpTTSEngineSite_iface;

    return S_OK;
}

/* IConnectionPointContainer interface */
static HRESULT WINAPI container_QueryInterface(IConnectionPointContainer *iface, REFIID iid, void **obj)
{
    struct speech_voice *This = impl_from_IConnectionPointContainer(iface);

    TRACE("(%p, %s %p).\n", iface, debugstr_guid(iid), obj);

    return ISpeechVoice_QueryInterface(&This->ISpeechVoice_iface, iid, obj);
}

static ULONG WINAPI container_AddRef(IConnectionPointContainer *iface)
{
    struct speech_voice *This = impl_from_IConnectionPointContainer(iface);

    TRACE("(%p).\n", iface);

    return ISpeechVoice_AddRef(&This->ISpeechVoice_iface);
}

static ULONG WINAPI container_Release(IConnectionPointContainer *iface)
{
    struct speech_voice *This = impl_from_IConnectionPointContainer(iface);

    TRACE("(%p).\n", iface);

    return ISpeechVoice_Release(&This->ISpeechVoice_iface);
}

static HRESULT WINAPI container_EnumConnectionPoints(IConnectionPointContainer *iface,
                                                     IEnumConnectionPoints **enum_cp)
{
    FIXME("(%p, %p): stub.\n", iface, enum_cp);

    return E_NOTIMPL;
}

static HRESULT WINAPI container_FindConnectionPoint(IConnectionPointContainer *iface, REFIID riid,
                                                    IConnectionPoint **cp)
{
    FIXME("(%p, %s, %p): stub.\n", iface, debugstr_guid(riid), cp);

    return E_NOTIMPL;
}

const static IConnectionPointContainerVtbl container_vtbl =
{
    container_QueryInterface,
    container_AddRef,
    container_Release,
    container_EnumConnectionPoints,
    container_FindConnectionPoint
};

HRESULT speech_voice_create(IUnknown *outer, REFIID iid, void **obj)
{
    struct speech_voice *This = heap_alloc(sizeof(*This));
    HRESULT hr;

    if (!This) return E_OUTOFMEMORY;
    This->ISpeechVoice_iface.lpVtbl = &speech_voice_vtbl;
    This->ISpVoice_iface.lpVtbl = &spvoice_vtbl;
    This->IConnectionPointContainer_iface.lpVtbl = &container_vtbl;
    This->ref = 1;

    This->output = NULL;
    This->engine = NULL;
    This->cur_stream_num = 0;
    This->actions = SPVES_CONTINUE;
    This->volume = 100;
    This->rate = 0;
    memset(&This->queue, 0, sizeof(This->queue));

    InitializeCriticalSection(&This->cs);

    hr = ISpeechVoice_QueryInterface(&This->ISpeechVoice_iface, iid, obj);

    ISpeechVoice_Release(&This->ISpeechVoice_iface);
    return hr;
}