音频采集 via Media Foundation

这里所说的音频采集是指通过麦克风采集声音数据然后经过编码保存为磁盘上的一个文件。
Windows 上有如下几种常见的实现方式:

Media Foundation 简介

Media Foundation(简称 MF)是微软在 Windows Vista 上推出的新一代多媒体应用库,目的是为 Windows 平台提供一个统一的多媒体影音解决方案。开发者可以通过 MF 播放视频或声音文件、进行多媒体文件格式转码,或者将一连串图片编码为视频等等。
MF 是 DirectShow 为主的旧式多媒体应用程序接口的替代者与继承者,在微软的计划下将逐步汰换 DirectShow 技术。MF 要求 Windows Vista 或更高版本,不支持较早期的 Windows 版本,特别是 Windows XP。
MF 擅长高质量的音频和视频播放、高清内容(如 HDTV,高清电视)以及数字版权管理(DRM)访问控制。MF 在不同的 Windows 版本上能力不同,例如 Windows 7 上添加了 H.264 编码支持,Windows 8 上则提供了数种更高质量的设置。

Media Foundation 采集音频

采集流程图

音频采集 via Media Foundation

采集代码概览

以下是整个 MF 采集过程的概要代码,略去设备枚举和 CMFCapture 类的实现。

// Outline of one capture session, from initialisation to teardown.
// NOTE(review): every HRESULT is stored in hr but never inspected in this
// outline; production code should check it after each call.
hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED | COINIT_DISABLE_OLE1DDE);
hr = MFStartup(MF_VERSION);

// Enumerate the capture devices.
hr = UpdateDeviceList(hDlg, true);

// Encoder settings handed to startCapture below: output subtype and bit rate.
EncodingParameters audEncParam;
audEncParam.subType = _getSubType(hDlg, true);
audEncParam.bitRate = TARGET_AUD_BIT_RATE;

// Activation object of the device to capture from
// (presumably the one selected in the dialog - confirm).
CComPtr<IMFActivate> pAudActivate = NULL;
hr = GetSelectedDevice(hDlg, &pAudActivate, true);

// Create the capture object and start recording into pszFile.
hr = CMFCapture::CreateInstance(hDlg, &g_pCapture);
hr = g_pCapture->startCapture(pAudActivate, &audEncParam, pszFile);

// Capturing ...

hr = g_pCapture->stopCapture();

// Tear down in the reverse order of initialisation.
g_pCapture->Release();
MFShutdown();
CoUninitialize();

MMDeviceHelper::enumAudCapDevices 函数

此处通过向 _enumMFDevices 传入设备类型 GUID 的方式来枚举音频设备,因为同一个函数也可以用来枚举视频设备。

// Enumerates the audio capture devices.
// Delegates to _enumMFDevices with the audio-capture source-type GUID and
// returns its HRESULT unchanged.
HRESULT MMDeviceHelper::enumAudCapDevices()
{
    // The shared enumerator is _enumMFDevices; it takes the device type as a GUID.
    return _enumMFDevices(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID);
}

// Enumerates the Media Foundation device sources of one type.
//
// devType  value stored under MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE to select
//          which kind of device is enumerated.
//
// On success the results are left in m_ppDevices / m_cDevices.
// Returns the HRESULT of the first failing call, otherwise the result of
// MFEnumDeviceSources.
HRESULT MMDeviceHelper::_enumMFDevices(const GUID& devType)
{
    // clear() runs first, presumably to drop the previous enumeration - confirm.
    clear();

    // One-entry attribute store that carries the enumeration filter.
    CComPtr<IMFAttributes> spEnumAttrs;
    HRESULT hr = MFCreateAttributes(&spEnumAttrs, 1);
    RETURN_IF_FAILED(hr);

    hr = spEnumAttrs->SetGUID(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, devType);
    RETURN_IF_FAILED(hr);

    hr = MFEnumDeviceSources(spEnumAttrs, &m_ppDevices, &m_cDevices);
    RETURN_IF_FAILED(hr);

    return hr;
}

CMFCapture::startCapture 函数

// Starts an asynchronous audio capture that is written to pszFileName.
//
// pAudioAct     activation object of the capture device (forwarded to
//               _configAudioCapture).
// pAudEncParam  encoder settings (forwarded to _configAudioCapture).
// pszFileName   output URL handed to MFCreateSinkWriterFromURL.
//
// Returns the first failing HRESULT. If a step after the sink writer creation
// fails, the reader and the writer are released again before returning.
HRESULT CMFCapture::startCapture(IMFActivate *pAudioAct, EncodingParameters* pAudEncParam, LPCTSTR pszFileName)
{
    HRESULT hr = S_OK;
    SyncUtil::AutoLock lock(m_critsec);

    hr = MFCreateSinkWriterFromURL(pszFileName, NULL, NULL, &m_pWriter);
    RETURN_IF_MF_FAILED(hr);

    // OnReadSample uses these to rebase timestamps on the first sample.
    m_bFirstSample = TRUE;
    m_llBaseTime = 0;

    hr = _configAudioCapture(pAudioAct, pAudEncParam);
    GOTO_LABEL_IF_FAILED(hr, OnErr);

    hr = m_pWriter->BeginWriting();
    GOTO_LABEL_IF_FAILED(hr, OnErr);

    // Raise the flag BEFORE issuing the first read request. OnReadSample
    // checks isCapturing() without holding the lock and returns without
    // re-arming the reader when it is false; if the flag were set only after
    // ReadSample, an early callback could silently stall the capture.
    m_isCapturing = true;
    hr = m_pAudioReader->ReadSample((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, NULL, NULL, NULL, NULL);
    if (FAILED(hr))
        m_isCapturing = false;
    GOTO_LABEL_IF_FAILED(hr, OnErr);

    return hr;
OnErr:
    SAFE_RELEASE(m_pAudioReader);
    SAFE_RELEASE(m_pWriter);
    return hr;
}

CMFCapture::_configAudioCapture 函数

// Wires the capture device to the sink writer.
//
// Activates the media source from pActivate, creates and configures the
// source reader (m_pAudioReader) on top of it, adds an encoded audio stream
// to m_pWriter according to pEncParam, and sets the reader's current output
// type as that stream's input type. The sink stream index is stored in
// m_audioStreamIdx.
//
// Returns the HRESULT of the first failing step, S_OK otherwise.
HRESULT CMFCapture::_configAudioCapture( IMFActivate *pActivate, EncodingParameters* pEncParam )
{
    HRESULT hr = E_FAIL;
    CComPtr<IMFMediaSource> pSource = NULL;
    CComPtr<IMFMediaType> pType = NULL;
    DWORD sinkStream = 0;
    // startCapture already holds m_critsec when it calls this function, so this
    // assumes the lock is re-entrant - TODO confirm for SyncUtil::AutoLock.
    SyncUtil::AutoLock lock(m_critsec);

    hr = pActivate->ActivateObject(__uuidof(IMFMediaSource), (void**)&pSource);
    RETURN_IF_FAILED(hr);

    // 'this' is registered as the reader's asynchronous callback.
    hr = createSrcReader(pSource, m_pAudioReader, this);
    RETURN_IF_FAILED(hr);

    // Call matches the one-parameter configSrcReader(IMFSourceReader*) signature.
    hr = configSrcReader(m_pAudioReader);
    RETURN_IF_FAILED(hr);

    hr = m_pAudioReader->GetCurrentMediaType((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, &pType);
    RETURN_IF_FAILED(hr);

    // Call matches configAudioEncoder(pSrcType, params, pWriter, pStreamIdx).
    hr = configAudioEncoder(pType, pEncParam, m_pWriter, &sinkStream);
    RETURN_IF_FAILED(hr);
    m_audioStreamIdx = sinkStream;

    // The writer receives samples in the reader's current output format.
    hr = m_pWriter->SetInputMediaType(sinkStream, pType, NULL);
    RETURN_IF_MF_FAILED(hr);

    return S_OK;
}

createSrcReader 函数

// Builds an asynchronous source reader on top of a media source.
//
// pSource    media source the reader is created from.
// pReader    receives the new reader (reference to the caller's pointer).
// pCallback  object registered under MF_SOURCE_READER_ASYNC_CALLBACK.
//
// Returns the HRESULT of the first failing call, otherwise the result of
// MFCreateSourceReaderFromMediaSource.
HRESULT createSrcReader(IMFMediaSource *pSource, IMFSourceReader*& pReader, IUnknown* pCallback)
{
    // Two attributes are set below: the callback and the disconnect flag.
    CComPtr<IMFAttributes> spReaderAttrs;
    HRESULT hr = MFCreateAttributes(&spReaderAttrs, 2);
    RETURN_IF_FAILED(hr);

    hr = spReaderAttrs->SetUnknown(MF_SOURCE_READER_ASYNC_CALLBACK, pCallback);
    RETURN_IF_FAILED(hr);

    // Releasing a source reader normally makes it call IMFMediaSource::Shutdown
    // on its media source, after which the application can no longer use that
    // source. With MF_SOURCE_READER_DISCONNECT_MEDIASOURCE_ON_SHUTDOWN set to
    // TRUE the reader leaves the source running, so the application may keep
    // using the media source after the reader has been released.
    hr = spReaderAttrs->SetUINT32(MF_SOURCE_READER_DISCONNECT_MEDIASOURCE_ON_SHUTDOWN, TRUE);
    RETURN_IF_FAILED(hr);

    hr = MFCreateSourceReaderFromMediaSource(pSource, spReaderAttrs, &pReader);
    RETURN_IF_MF_FAILED(hr);

    return hr;
}

configSrcReader 函数

// Selects an uncompressed output format (float or PCM) on the reader's first
// audio stream.
//
// First every native media type of the stream is tried: if its subtype is in
// the accepted list and the reader accepts it, that native type is used.
// Otherwise a native type is rewritten with each accepted subtype in turn and
// offered to the reader, which lets the reader insert a decoder.
//
// pReader  source reader to configure.
//
// Returns the HRESULT of the last SetCurrentMediaType attempt, or the failure
// that prevented an attempt.
HRESULT configSrcReader(IMFSourceReader *pReader)
{
    // The list of acceptable types.
    std::vector<GUID> subTypes;
    subTypes.push_back(MFAudioFormat_Float);
    subTypes.push_back(MFAudioFormat_PCM);

    HRESULT hr = S_OK;
    BOOL bUseNativeType = FALSE;
    GUID subType = { 0 };
    CComPtr<IMFMediaType> pType = NULL;

    // If the source's native format matches any of the formats in the list, prefer the native format.

    // Note: The source might support multiple output formats.
    // The application could provide a list to the user and have the user select the output format.
    // That is outside the scope of this sample, however.

    // The loop ends when GetNativeMediaType fails, i.e. when the native types are exhausted.
    for (DWORD typeIdx = 0; ; ++typeIdx) {
        hr = pReader->GetNativeMediaType((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, typeIdx, &pType);
        if (FAILED(hr))
            break;

        hr = pType->GetGUID(MF_MT_SUBTYPE, &subType);
        if (FAILED(hr))
            break;
        // Log only a subtype that was actually retrieved.
        DL_MEDIA_TYPE(subType);

        // Distinct index name: the original reused 'i' and shadowed the outer loop index.
        for (size_t k = 0; k < subTypes.size(); ++k) {
            if (subType == subTypes[k]) {
                hr = pReader->SetCurrentMediaType((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, NULL, pType);
                if (SUCCEEDED(hr)) {
                    bUseNativeType = TRUE;
                    break;
                }
            }
        }

        if (bUseNativeType)
            break;

        // Release the rejected type so the next GetNativeMediaType can refill the pointer.
        pType = NULL;
    }

    if (!bUseNativeType) {
        // None of the native types worked. The source might offer a compressed type.
        // Try adding a decoder.
        if (pType == NULL) {
            hr = pReader->GetNativeMediaType((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, &pType);
            RETURN_IF_FAILED(hr);
        }

        for (size_t k = 0; k < subTypes.size(); ++k) {
            hr = pType->SetGUID(MF_MT_SUBTYPE, subTypes[k]);
            RETURN_IF_FAILED(hr);

            hr = pReader->SetCurrentMediaType((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, NULL, pType);
            if (SUCCEEDED(hr))
                break;
        }
    }

    return hr;
}

configAudioEncoder 函数

// Adds an encoded audio stream to the sink writer.
//
// pSrcType    media type of the incoming audio, used to pick the target type.
// params      encoder settings; subType and bitRate are read.
// pWriter     sink writer that receives the new stream.
// pStreamIdx  receives the index of the stream added to the writer.
//
// Returns the HRESULT of the first failing step, otherwise the result of
// IMFSinkWriter::AddStream. Null arguments are rejected by RETURN_IF_NULL.
HRESULT configAudioEncoder(IMFMediaType* pSrcType, EncodingParameters* params, IMFSinkWriter *pWriter, DWORD *pStreamIdx)
{
    RETURN_IF_NULL(pSrcType);
    RETURN_IF_NULL(params);
    RETURN_IF_NULL(pWriter);
    RETURN_IF_NULL(pStreamIdx);

    // Pick the encoder output type for the requested subtype.
    CComPtr<IMFMediaType> spEncodedType;
    HRESULT hr = makeTargetAudioType(pSrcType, params->subType, &spEncodedType);
    RETURN_IF_FAILED(hr);

    // Some encoders may report an error when given an invalid bit rate.
    hr = spEncodedType->SetUINT32(MF_MT_AVG_BITRATE, params->bitRate);
    RETURN_IF_FAILED(hr);

    hr = pWriter->AddStream(spEncodedType, pStreamIdx);
    RETURN_IF_FAILED(hr);

    return hr;
}
makeTargetAudioType 函数
// Chooses the encoder output type that best matches the input audio type.
//
// pInputType     media type of the audio that will be encoded.
// targetSubType  subtype (codec) whose available output types are enumerated.
// ppTargetType   receives the chosen type; the caller owns the reference.
//
// All output types reported by MFTranscodeGetAudioOutputAvailableTypes are
// compared with isBetterAudioTypeMatch and the winner is returned.
// Returns S_OK on success, or the HRESULT of the first failing call. Null
// arguments and an empty candidate list are rejected by RETURN_IF_NULL.
HRESULT makeTargetAudioType( IMFMediaType* pInputType, GUID& targetSubType, IMFMediaType** ppTargetType )
{
    RETURN_IF_NULL(pInputType);
    RETURN_IF_NULL(ppTargetType);
    HRESULT hr = E_FAIL;
    CComPtr<IMFMediaType> spBestMatchType;

    CComPtr<IMFCollection> spTypeCollection;
    hr = ::MFTranscodeGetAudioOutputAvailableTypes( targetSubType, MFT_ENUM_FLAG_ALL, NULL, &spTypeCollection );
    RETURN_IF_FAILED(hr);

    DWORD cTypes = 0;
    hr = spTypeCollection->GetElementCount(&cTypes);
    RETURN_IF_FAILED(hr);

    for (DWORD i = 0; i < cTypes; i++) {
        CComPtr<IUnknown> spTypeUnk;
        CComPtr<IMFMediaType> spType;

        // Check both results: a failed GetElement would otherwise leave
        // spTypeUnk null and the QueryInterface call would dereference it.
        hr = spTypeCollection->GetElement(i, &spTypeUnk);
        RETURN_IF_FAILED(hr);

        hr = spTypeUnk->QueryInterface( IID_PPV_ARGS(&spType) );
        RETURN_IF_FAILED(hr);

        // The first candidate is taken unconditionally; later ones must beat it.
        if (NULL == spBestMatchType || isBetterAudioTypeMatch(pInputType, spType, spBestMatchType)) {
            spBestMatchType = spType;
        }
    }
    RETURN_IF_NULL(spBestMatchType);

    // Hand the reference over to the caller.
    *ppTargetType = spBestMatchType.Detach();
    return S_OK;
}

IMFSourceReaderCallback::OnReadSample 回调函数

// Source reader callback: writes the delivered sample to the sink writer and
// requests the next one.
//
// hrStatus     status of the read; a failure is returned to the caller as is.
// llTimeStamp  timestamp of the sample; rebased so the first sample starts at 0.
// pSample      delivered sample, may be NULL (then only the next read is requested).
// The two unnamed DWORD parameters are not used.
//
// Returns S_OK, or the first failing HRESULT.
HRESULT CMFCapture::OnReadSample(HRESULT hrStatus, DWORD, DWORD, LONGLONG llTimeStamp, IMFSample *pSample)
{
    // Checked before the lock is taken: nothing to do once capture is off.
    if (!isCapturing()) {
        return S_OK;
    }

    SyncUtil::AutoLock guard(m_critsec);
    if (m_pWriter == NULL) {
        return S_OK;
    }

    HRESULT hr = S_OK;
    RETURN_IF_FAILED(hrStatus);

    if (pSample != NULL) {
        // Remember the timestamp of the very first sample as the time base.
        if (m_bFirstSample) {
            m_bFirstSample = FALSE;
            m_llBaseTime = llTimeStamp;
        }

        hr = pSample->SetSampleTime(llTimeStamp - m_llBaseTime);
        RETURN_IF_FAILED(hr);

        hr = m_pWriter->WriteSample(m_audioStreamIdx, pSample);
        RETURN_IF_FAILED(hr);
    }

    // Re-arm the reader so this callback fires again for the next sample.
    hr = m_pAudioReader->ReadSample((DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, NULL, NULL, NULL, NULL);
    RETURN_IF_FAILED(hr);

    return S_OK;
}

CMFCapture::stopCapture 函数

在 Flush Source Reader 的时候,如果不等待 IMFSourceReaderCallback::OnFlush 回调完成,可能会造成死锁。

// Stops the capture: finalizes and releases the sink writer, then flushes and
// releases the source reader.
//
// Returns the HRESULT of IMFSinkWriter::Finalize, or S_OK when there was no
// writer. The results of the Flush calls and of the wait are not checked.
HRESULT CMFCapture::stopCapture()
{ 
    HRESULT hr = S_OK;
    // Cleared before the lock is taken; OnReadSample tests isCapturing() before
    // locking (presumably isCapturing() reads this flag - confirm).
    m_isCapturing = false;
    SyncUtil::AutoLock lock(m_critsec);
    
    if (NULL != m_pWriter) {
        m_pWriter->Flush(m_audioStreamIdx);
        // NOTE(review): only m_audioStreamIdx is assigned in the code shown in
        // this file; confirm m_videoStreamIdx is a valid stream index here.
        m_pWriter->Flush(m_videoStreamIdx);
        
        hr = m_pWriter->Finalize();
        SAFE_RELEASE(m_pWriter);
    }
    
    if (NULL != m_pAudioReader) {
        m_pAudioReader->Flush(MF_SOURCE_READER_FIRST_AUDIO_STREAM);
        // Wait up to 3000 ms for OnFlush to signal m_hFlushedEvent before the
        // reader is released.
        // NOTE(review): m_critsec is still held during this wait, and
        // OnReadSample also acquires it - confirm a callback blocked on the
        // lock cannot delay OnFlush until the timeout expires.
        WaitForSingleObject(m_hFlushedEvent, 3000);
        SAFE_RELEASE(m_pAudioReader);
    }
    
    return hr;
}

TDMETHODIMP OnFlush(DWORD dwStreamIndex)
{
    SetEvent(m_hFlushedEvent);
    return S_OK;
}

Tools

TopoEdit

TopoEdit 是用于构建和测试 MF Topology 的可视化工具,包含在 Windows SDK 7.x 中(Bin/TopoEdit.exe)。使用 TopoEdit,可以:

  • 通过添加各种 Topology Node(如 source, transform 和 output node)来构建 Topology
  • 连接并决定 topology 结构
  • 通过回放测试 topology 的功能
    音频采集 via Media Foundation

MFTrace

MFTrace 工具是 Windows 7 SDK 的一部分(Bin/MFTrace.exe)。 MFTrace 可以 hook 正在运行的 MF 应用程序并从中接收详细的跟踪信息。 从 traces 中可以看到 topology 的完整连接,以及找出程序执行过程中出错的地方。
音频采集 via Media Foundation

TextAnalysisTool

TextAnalysisTool.NET 工具可以加载日志文件,允许输入字符串模式以过滤掉或突出显示 log 信息。
音频采集 via Media Foundation

音频采集 via Media Foundation
EOF