diff --git a/bin/toucan-render/App.cpp b/bin/toucan-render/App.cpp index 83f658e..e4c22e3 100644 --- a/bin/toucan-render/App.cpp +++ b/bin/toucan-render/App.cpp @@ -3,6 +3,7 @@ #include "App.h" +#include #include #include #include @@ -17,6 +18,7 @@ extern "C" } // extern "C" +#include #include namespace toucan @@ -42,7 +44,7 @@ namespace toucan { "444p16", OIIO::ImageSpec(0, 0, 3, OIIO::TypeDesc::BASETYPE::UINT16) } }; } - + void App::_init( const std::shared_ptr& context, std::vector& argv) @@ -98,6 +100,29 @@ namespace toucan std::vector{ "-v" }, "Print verbose output."); + _cmdLine.audioCodec = ftk::CmdLineValueOption::create( + std::vector{ "-acodec" }, + "Set the audio codec.", + "", + "pcm_s16le", + ftk::join(ffmpeg::getAudioCodecStrings(), ", ")); + _cmdLine.audioSampleRate = ftk::CmdLineValueOption::create( + std::vector{ "-arate" }, + "Set the audio sample rate.", + "", + 48000); + _cmdLine.audioChannelCount = ftk::CmdLineValueOption::create( + std::vector{ "-achannels" }, + "Set the audio channel count.", + "", + 2); + _cmdLine.audioFile = ftk::CmdLineValueOption::create( + std::vector{ "-afile" }, + "Write audio to a separate file."); + _cmdLine.noAudio = ftk::CmdLineFlagOption::create( + std::vector{ "-no_audio" }, + "Disable audio output."); + IApp::_init( context, argv, @@ -112,7 +137,12 @@ namespace toucan _cmdLine.printSize, _cmdLine.raw, _cmdLine.y4m, - _cmdLine.verbose + _cmdLine.verbose, + _cmdLine.audioCodec, + _cmdLine.audioSampleRate, + _cmdLine.audioChannelCount, + _cmdLine.audioFile, + _cmdLine.noAudio }); if (_cmdLine.output->hasValue() && _cmdLine.output->getValue() == "-") @@ -123,7 +153,7 @@ namespace toucan App::App() {} - + App::~App() { if (_swsContext) @@ -148,7 +178,7 @@ namespace toucan out->_init(context, argv); return out; } - + void App::run() { const std::filesystem::path parentPath = std::filesystem::path(getExeName()).parent_path(); @@ -165,7 +195,7 @@ namespace toucan const OTIO_NS::TimeRange& timeRange = _timelineWrapper->getTimeRange(); const OTIO_NS::RationalTime timeInc(1.0, timeRange.duration().rate()); const int frames = timeRange.duration().value(); - + // Create the image graph. _graph = std::make_shared( _context, @@ -195,9 +225,35 @@ namespace toucan return; } + // Audio settings. + const int audioSampleRate = _cmdLine.audioSampleRate->hasValue() ? + _cmdLine.audioSampleRate->getValue() : 48000; + const int audioChannelCount = _cmdLine.audioChannelCount->hasValue() ? + _cmdLine.audioChannelCount->getValue() : 2; + + // Create the audio graph. + if (!_cmdLine.noAudio->found()) + { + _audioGraph = std::make_shared( + _context, + inputPath.parent_path(), + _timelineWrapper, + audioSampleRate, + audioChannelCount); + } + // Create the image host. _host = std::make_shared(_context, getOpenFXPluginPaths(getExeName())); + // Audio codec. + ffmpeg::AudioCodec audioCodec = ffmpeg::AudioCodec::PCM_S16LE; + if (_cmdLine.audioCodec->hasValue()) + { + ffmpeg::fromString(_cmdLine.audioCodec->getValue(), audioCodec); + } + + const bool includeAudio = _audioGraph && _audioGraph->hasAudio(); + // Open the movie file. std::shared_ptr ffWrite; if (hasExtension(outputPath.extension().string(), MovieReadNode::getExtensions())) @@ -211,9 +267,26 @@ namespace toucan outputPath, OIIO::ImageSpec(imageSize.x, imageSize.y, 3), timeRange, - videoCodec); + videoCodec, + includeAudio ? audioSampleRate : 0, + includeAudio ? audioChannelCount : 0, + audioCodec); + } + + // Open the separate audio file. + std::shared_ptr audioFileWrite; + if (_cmdLine.audioFile->hasValue() && includeAudio) + { + audioFileWrite = std::make_shared( + std::filesystem::path(_cmdLine.audioFile->getValue()), + audioSampleRate, + audioChannelCount, + audioCodec); } + const int samplesPerFrame = static_cast( + std::round(static_cast(audioSampleRate) / timeRange.duration().rate())); + // Render the timeline frames. if (_cmdLine.y4m->hasValue()) { @@ -261,6 +334,21 @@ namespace toucan _writeY4mFrame(buf); } } + + // Render and write audio for this frame. + if (includeAudio) + { + const AudioBuffer audioBuf = _audioGraph->exec(time, samplesPerFrame); + + if (ffWrite) + { + ffWrite->writeAudio(audioBuf); + } + if (audioFileWrite) + { + audioFileWrite->writeAudio(audioBuf); + } + } } } @@ -484,4 +572,3 @@ namespace toucan } } } - diff --git a/bin/toucan-render/App.h b/bin/toucan-render/App.h index c73a206..d45dbbe 100644 --- a/bin/toucan-render/App.h +++ b/bin/toucan-render/App.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -31,18 +32,18 @@ namespace toucan public: ~App(); - + static std::shared_ptr create( const std::shared_ptr&, std::vector&); void run() override; - + private: void _writeRawFrame(const OIIO::ImageBuf&); void _writeY4mHeader(); void _writeY4mFrame(const OIIO::ImageBuf&); - + struct CmdLine { std::shared_ptr > input; @@ -57,11 +58,18 @@ namespace toucan std::shared_ptr > raw; std::shared_ptr > y4m; std::shared_ptr verbose; + + std::shared_ptr > audioCodec; + std::shared_ptr > audioSampleRate; + std::shared_ptr > audioChannelCount; + std::shared_ptr > audioFile; + std::shared_ptr noAudio; }; CmdLine _cmdLine; std::shared_ptr _timelineWrapper; std::shared_ptr _graph; + std::shared_ptr _audioGraph; std::shared_ptr _host; AVFrame* _avFrame = nullptr; @@ -71,4 +79,3 @@ namespace toucan SwsContext* _swsContext = nullptr; }; } - diff --git a/cmake/SuperBuild/BuildFFmpeg.cmake b/cmake/SuperBuild/BuildFFmpeg.cmake index 0ab8904..a8a9572 100644 --- a/cmake/SuperBuild/BuildFFmpeg.cmake +++ b/cmake/SuperBuild/BuildFFmpeg.cmake @@ -100,6 +100,7 @@ set(FFmpeg_CONFIGURE_ARGS if(toucan_FFmpeg_MINIMAL) list(APPEND FFmpeg_CONFIGURE_ARGS --disable-decoders + --enable-decoder=aac --enable-decoder=apv --enable-decoder=av1 --enable-decoder=flac @@ -151,6 +152,7 @@ if(toucan_FFmpeg_MINIMAL) --enable-decoder=vp9 --enable-decoder=yuv4 --disable-encoders + --enable-encoder=aac --enable-encoder=flac --enable-encoder=mjpeg --enable-encoder=mpeg2video @@ -265,6 +267,7 @@ if(toucan_FFmpeg_MINIMAL) --enable-muxer=wav --enable-muxer=yuv4mpegpipe --disable-parsers + --enable-parser=aac --enable-parser=apv --enable-parser=av1 --enable-parser=flac diff --git a/lib/toucanRender/AudioBuffer.h b/lib/toucanRender/AudioBuffer.h new file mode 100644 index 0000000..9227425 --- /dev/null +++ b/lib/toucanRender/AudioBuffer.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#pragma once + +#include +#include + +namespace toucan +{ + struct AudioBuffer + { + std::vector data; + int sampleRate = 0; + int channelCount = 0; + int sampleCount = 0; + + bool isValid() const { return !data.empty() && sampleRate > 0; } + size_t byteCount() const { return data.size() * sizeof(float); } + }; +} diff --git a/lib/toucanRender/AudioGraph.cpp b/lib/toucanRender/AudioGraph.cpp new file mode 100644 index 0000000..5d30268 --- /dev/null +++ b/lib/toucanRender/AudioGraph.cpp @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#include "AudioGraph.h" + +#include "TimelineAlgo.h" +#include "Util.h" + +#include + +#include +#include +#include +#include + +#include +#include + +namespace toucan +{ + namespace + { + const std::string logPrefix = "toucan::AudioGraph"; + } + + AudioGraph::AudioGraph( + const std::shared_ptr& context, + const std::filesystem::path& path, + const std::shared_ptr& timelineWrapper, + int sampleRate, + int channelCount) : + _context(context), + _path(path), + _timelineWrapper(timelineWrapper), + _timeRange(timelineWrapper->getTimeRange()), + _sampleRate(sampleRate), + _channelCount(channelCount) + { + _audioReadCache.setMax(20); + + for (const auto& child : _timelineWrapper->getTimeline()->tracks()->children()) + { + if (auto track = OTIO_NS::dynamic_retainer_cast(child)) + { + if (track->kind() == OTIO_NS::Track::Kind::audio && + !track->find_clips().empty()) + { + _hasAudio = true; + break; + } + } + } + + if (!_hasAudio) + { + for (const auto& child : _timelineWrapper->getTimeline()->tracks()->children()) + { + if (auto track = OTIO_NS::dynamic_retainer_cast(child)) + { + if (track->kind() == OTIO_NS::Track::Kind::video) + { + for (auto clip : track->find_clips()) + { + if (auto externalRef = dynamic_cast( + clip->media_reference())) + { + try + { + const std::string mediaPath = + _timelineWrapper->getMediaPath(externalRef->target_url()); + auto audioRead = std::make_shared( + mediaPath, _sampleRate, _channelCount); + if (audioRead->hasAudio()) + { + _hasAudio = true; + _audioReadCache.add(externalRef, audioRead); + break; + } + } + catch (const std::exception&) + { + } + } + } + if (_hasAudio) break; + } + } + } + } + } + + AudioGraph::~AudioGraph() + {} + + int AudioGraph::getSampleRate() const + { + return _sampleRate; + } + + int AudioGraph::getChannelCount() const + { + return _channelCount; + } + + bool AudioGraph::hasAudio() const + { + return _hasAudio; + } + + AudioBuffer AudioGraph::exec(const OTIO_NS::RationalTime& time, int sampleCount) + { + AudioBuffer out; + out.sampleRate = _sampleRate; + out.channelCount = _channelCount; + out.sampleCount = sampleCount; + out.data.resize(sampleCount * _channelCount, 0.F); + + auto stack = _timelineWrapper->getTimeline()->tracks(); + const auto& stackEffects = stack->effects(); + OTIO_NS::RationalTime t = time - _timeRange.start_time(); + t = _timeWarps(t, stack->available_range(), stackEffects); + + int trackCount = 0; + for (const auto& i : stack->children()) + { + if (auto track = OTIO_NS::dynamic_retainer_cast(i)) + { + bool processTrack = false; + if (track->kind() == OTIO_NS::Track::Kind::audio && + !track->find_clips().empty()) + { + processTrack = true; + } + else if (track->kind() == OTIO_NS::Track::Kind::video && + !track->find_clips().empty()) + { + processTrack = true; + } + + if (processTrack) + { + const auto& trackEffects = track->effects(); + OTIO_NS::RationalTime t2 = t; + if (!trackEffects.empty()) + { + t2 = _timeWarps(t2, track->available_range(), trackEffects); + } + + AudioBuffer trackBuf = _track(t2, sampleCount, track); + + _applyEffects(trackBuf, trackEffects); + + if (trackBuf.isValid()) + { + for (size_t j = 0; j < out.data.size() && j < trackBuf.data.size(); ++j) + { + out.data[j] += trackBuf.data[j]; + } + ++trackCount; + } + } + } + } + + for (size_t j = 0; j < out.data.size(); ++j) + { + out.data[j] = std::max(-1.F, std::min(1.F, out.data[j])); + } + + return out; + } + + AudioBuffer AudioGraph::_track( + const OTIO_NS::RationalTime& time, + int sampleCount, + const OTIO_NS::SerializableObject::Retainer& track) + { + AudioBuffer out; + out.sampleRate = _sampleRate; + out.channelCount = _channelCount; + out.sampleCount = sampleCount; + out.data.resize(sampleCount * _channelCount, 0.F); + + OTIO_NS::SerializableObject::Retainer item; + OTIO_NS::SerializableObject::Retainer prev; + OTIO_NS::SerializableObject::Retainer prev2; + OTIO_NS::SerializableObject::Retainer next; + OTIO_NS::SerializableObject::Retainer next2; + const auto& children = track->children(); + for (size_t i = 0; i < children.size(); ++i) + { + if ((item = OTIO_NS::dynamic_retainer_cast(children[i]))) + { + const auto trimmedRangeInParent = item->trimmed_range_in_parent(); + if (trimmedRangeInParent.has_value() && trimmedRangeInParent.value().contains(time)) + { + out = _item( + track->transformed_time(time, item), + sampleCount, + item); + if (i > 0) + { + prev = children[i - 1]; + } + if (i > 1) + { + prev2 = children[i - 2]; + } + if (i < (children.size() - 1)) + { + next = children[i + 1]; + } + if (children.size() > 1 && i < (children.size() - 2)) + { + next2 = children[i + 2]; + } + break; + } + } + } + + if (item) + { + if (auto prevTransition = OTIO_NS::dynamic_retainer_cast(prev)) + { + const auto trimmedRangeInParent = prevTransition->trimmed_range_in_parent(); + if (trimmedRangeInParent.has_value() && trimmedRangeInParent.value().contains(time)) + { + if (auto prevItem = OTIO_NS::dynamic_retainer_cast(prev2)) + { + const double value = + (time - trimmedRangeInParent.value().start_time()).value() / + trimmedRangeInParent.value().duration().value(); + + AudioBuffer a = _item( + track->transformed_time(time, prevItem), + sampleCount, + prevItem); + + if (a.isValid() && out.isValid()) + { + for (size_t j = 0; j < out.data.size() && j < a.data.size(); ++j) + { + out.data[j] = a.data[j] * static_cast(1.0 - value) + + out.data[j] * static_cast(value); + } + } + } + } + } + if (auto nextTransition = OTIO_NS::dynamic_retainer_cast(next)) + { + const auto trimmedRangeInParent = nextTransition->trimmed_range_in_parent(); + if (trimmedRangeInParent.has_value() && trimmedRangeInParent.value().contains(time)) + { + if (auto nextItem = OTIO_NS::dynamic_retainer_cast(next2)) + { + const double value = + (time - trimmedRangeInParent.value().start_time()).value() / + trimmedRangeInParent.value().duration().value(); + + AudioBuffer b = _item( + track->transformed_time(time, nextItem), + sampleCount, + nextItem); + + if (b.isValid() && out.isValid()) + { + for (size_t j = 0; j < out.data.size() && j < b.data.size(); ++j) + { + out.data[j] = out.data[j] * static_cast(1.0 - value) + + b.data[j] * static_cast(value); + } + } + } + } + } + } + + return out; + } + + AudioBuffer AudioGraph::_item( + const OTIO_NS::RationalTime& time, + int sampleCount, + const OTIO_NS::SerializableObject::Retainer& item) + { + AudioBuffer out; + out.sampleRate = _sampleRate; + out.channelCount = _channelCount; + out.sampleCount = sampleCount; + out.data.resize(sampleCount * _channelCount, 0.F); + + OTIO_NS::RationalTime t = time; + + const auto& effects = item->effects(); + t = _timeWarps(t, item->available_range(), effects); + + if (auto clip = OTIO_NS::dynamic_retainer_cast(item)) + { + auto mediaRef = clip->media_reference(); + if (auto externalRef = dynamic_cast(mediaRef)) + { + std::shared_ptr audioRead; + if (!_audioReadCache.get(externalRef, audioRead)) + { + try + { + const std::string mediaPath = + _timelineWrapper->getMediaPath(externalRef->target_url()); + audioRead = std::make_shared( + mediaPath, _sampleRate, _channelCount); + _audioReadCache.add(externalRef, audioRead); + } + catch (const std::exception& e) + { + _context.lock()->getSystem()->print( + logPrefix, + e.what(), + ftk::LogType::Error); + } + } + if (audioRead && audioRead->hasAudio()) + { + if (clip->available_range().start_time() != + audioRead->getTimeRange().start_time()) + { + t -= clip->available_range().start_time(); + } + + out = audioRead->getAudio(t, sampleCount); + } + } + } + else if (auto gap = OTIO_NS::dynamic_retainer_cast(item)) + { + } + + _applyEffects(out, effects); + + return out; + } + + OTIO_NS::RationalTime AudioGraph::_timeWarps( + const OTIO_NS::RationalTime& time, + const OTIO_NS::TimeRange& timeRange, + const std::vector >& effects) + { + OTIO_NS::RationalTime out = time; + for (const auto& effect : effects) + { + if (auto linearTimeWarp = dynamic_cast(effect.value)) + { + const double s = linearTimeWarp->time_scalar(); + out = OTIO_NS::RationalTime( + (out - timeRange.start_time()).value() * s, + time.rate()).round(); + } + } + return out; + } + + void AudioGraph::_applyEffects( + AudioBuffer& buffer, + const std::vector >& effects) + { + for (const auto& effect : effects) + { + if (dynamic_cast(effect.value)) + { + continue; + } + + const auto& metaData = effect->metadata(); + auto volumeIt = metaData.find("volume"); + if (volumeIt != metaData.end() && volumeIt->second.has_value()) + { + try + { + const float volume = std::any_cast(volumeIt->second); + for (size_t i = 0; i < buffer.data.size(); ++i) + { + buffer.data[i] *= volume; + } + } + catch (const std::bad_any_cast&) + { + } + } + } + } +} diff --git a/lib/toucanRender/AudioGraph.h b/lib/toucanRender/AudioGraph.h new file mode 100644 index 0000000..ade069e --- /dev/null +++ b/lib/toucanRender/AudioGraph.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace toucan +{ + class AudioGraph : public std::enable_shared_from_this + { + public: + AudioGraph( + const std::shared_ptr&, + const std::filesystem::path&, + const std::shared_ptr&, + int sampleRate, + int channelCount); + + ~AudioGraph(); + + int getSampleRate() const; + int getChannelCount() const; + bool hasAudio() const; + + AudioBuffer exec(const OTIO_NS::RationalTime& time, int sampleCount); + + private: + AudioBuffer _track( + const OTIO_NS::RationalTime&, + int sampleCount, + const OTIO_NS::SerializableObject::Retainer&); + + AudioBuffer _item( + const OTIO_NS::RationalTime&, + int sampleCount, + const OTIO_NS::SerializableObject::Retainer&); + + OTIO_NS::RationalTime _timeWarps( + const OTIO_NS::RationalTime&, + const OTIO_NS::TimeRange&, + const std::vector >&); + + void _applyEffects( + AudioBuffer&, + const std::vector >&); + + std::weak_ptr _context; + std::filesystem::path _path; + std::shared_ptr _timelineWrapper; + OTIO_NS::TimeRange _timeRange; + int _sampleRate = 48000; + int _channelCount = 2; + bool _hasAudio = false; + + ftk::LRUCache > _audioReadCache; + }; +} diff --git a/lib/toucanRender/CMakeLists.txt b/lib/toucanRender/CMakeLists.txt index 02aa75c..845efc4 100644 --- a/lib/toucanRender/CMakeLists.txt +++ b/lib/toucanRender/CMakeLists.txt @@ -1,6 +1,10 @@ set(HEADERS + AudioBuffer.h + AudioGraph.h Comp.h FFmpeg.h + FFmpegAudioRead.h + FFmpegAudioWrite.h FFmpegRead.h FFmpegWrite.h ImageEffect.h @@ -17,8 +21,11 @@ set(HEADERS Util.h) set(HEADERS_PRIVATE) set(SOURCE + AudioGraph.cpp Comp.cpp FFmpeg.cpp + FFmpegAudioRead.cpp + FFmpegAudioWrite.cpp FFmpegRead.cpp FFmpegWrite.cpp ImageEffect.cpp @@ -44,6 +51,10 @@ else() endif() add_library(toucanRender ${HEADERS} ${HEADERS_PRIVATE} ${SOURCE}) +find_library(SWRESAMPLE_LIBRARY swresample PATHS ${CMAKE_INSTALL_PREFIX}/lib NO_DEFAULT_PATH) +if(NOT SWRESAMPLE_LIBRARY) + find_library(SWRESAMPLE_LIBRARY swresample) +endif() set(LIBS_PUBLIC toucanResource ftk::ftkCore @@ -52,6 +63,9 @@ set(LIBS_PUBLIC lunasvg::lunasvg OpenImageIO::OpenImageIO MINIZIP::minizip) +if(SWRESAMPLE_LIBRARY) + list(APPEND LIBS_PUBLIC ${SWRESAMPLE_LIBRARY}) +endif() if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9) list(APPEND LIBS_PUBLIC stdc++fs) endif() diff --git a/lib/toucanRender/FFmpeg.cpp b/lib/toucanRender/FFmpeg.cpp index 927a094..b42dc88 100644 --- a/lib/toucanRender/FFmpeg.cpp +++ b/lib/toucanRender/FFmpeg.cpp @@ -71,6 +71,49 @@ namespace toucan AV_PROFILE_UNKNOWN, AV_PROFILE_AV1_MAIN }; + + std::vector > _getAudioCodecs() + { + std::vector > out; + const AVCodec* avCodec = nullptr; + void* avCodecIterate = nullptr; + while ((avCodec = av_codec_iterate(&avCodecIterate))) + { + if (av_codec_is_encoder(avCodec) && + AVMEDIA_TYPE_AUDIO == avcodec_get_type(avCodec->id)) + { + out.push_back({ avCodec->id, avCodec->name }); + } + } + return out; + } + + const std::vector audioCodecStrings = + { + "pcm_s16le", + "pcm_s24le", + "pcm_f32le", + "flac", + "aac" + }; + + const std::vector audioCodecIds = + { + AV_CODEC_ID_PCM_S16LE, + AV_CODEC_ID_PCM_S24LE, + AV_CODEC_ID_PCM_F32LE, + AV_CODEC_ID_FLAC, + AV_CODEC_ID_AAC + }; + + const std::vector audioSampleFormats = + { + AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_S32, + AV_SAMPLE_FMT_FLT, + AV_SAMPLE_FMT_S32, + AV_SAMPLE_FMT_FLTP + }; } std::vector getVideoCodecs() @@ -122,6 +165,55 @@ namespace toucan return videoCodecProfiles[static_cast(value)]; } + std::vector getAudioCodecs() + { + std::vector out; + for (const auto& i : _getAudioCodecs()) + { + for (size_t j = 0; j < audioCodecIds.size(); ++j) + { + if (i.first == audioCodecIds[j]) + { + out.push_back(static_cast(j)); + } + } + } + return out; + } + + std::vector getAudioCodecStrings() + { + std::vector out; + for (const auto& i : getAudioCodecs()) + { + out.push_back(toString(i)); + } + return out; + } + + std::string toString(AudioCodec value) + { + return audioCodecStrings[static_cast(value)]; + } + + void fromString(const std::string& s, AudioCodec& value) + { + const auto i = std::find(audioCodecStrings.begin(), audioCodecStrings.end(), s); + value = i != audioCodecStrings.end() ? + static_cast(i - audioCodecStrings.begin()) : + AudioCodec::First; + } + + AVCodecID getAudioCodecId(AudioCodec value) + { + return audioCodecIds[static_cast(value)]; + } + + AVSampleFormat getAudioSampleFormat(AudioCodec value) + { + return audioSampleFormats[static_cast(value)]; + } + std::string getErrorLabel(int r) { char buf[4096]; diff --git a/lib/toucanRender/FFmpeg.h b/lib/toucanRender/FFmpeg.h index a14413c..95ec224 100644 --- a/lib/toucanRender/FFmpeg.h +++ b/lib/toucanRender/FFmpeg.h @@ -11,6 +11,7 @@ extern "C" #include #include #include +#include } namespace toucan @@ -52,6 +53,37 @@ namespace toucan //! Get a video codec profile. int getVideoCodecProfile(VideoCodec); + //! Audio codecs. + enum class AudioCodec + { + PCM_S16LE, + PCM_S24LE, + PCM_F32LE, + FLAC, + AAC, + + Count, + First = PCM_S16LE + }; + + //! Get a list of audio codecs. + std::vector getAudioCodecs(); + + //! Get a list of audio codec strings. + std::vector getAudioCodecStrings(); + + //! Convert an audio codec to a string. + std::string toString(AudioCodec); + + //! Convert a string to an audio codec. + void fromString(const std::string&, AudioCodec&); + + //! Get an audio codec ID. + AVCodecID getAudioCodecId(AudioCodec); + + //! Get an audio sample format. + AVSampleFormat getAudioSampleFormat(AudioCodec); + //! FFmpeg log callback. void log(void*, int level, const char* fmt, va_list vl); diff --git a/lib/toucanRender/FFmpegAudioRead.cpp b/lib/toucanRender/FFmpegAudioRead.cpp new file mode 100644 index 0000000..28f27e5 --- /dev/null +++ b/lib/toucanRender/FFmpegAudioRead.cpp @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#include "FFmpegAudioRead.h" + +#include +#include +#include + +extern "C" +{ +#include +#include +} + +namespace toucan +{ + namespace ffmpeg + { + namespace + { + const size_t avIOContextBufferSize = 4096; + + class Packet + { + public: + Packet() + { + p = av_packet_alloc(); + } + + ~Packet() + { + av_packet_free(&p); + } + + AVPacket* p = nullptr; + }; + } + + AudioRead::AudioRead( + const std::filesystem::path& path, + int outputSampleRate, + int outputChannelCount, + const MemoryReference& memoryReference) : + _path(path), + _memoryReference(memoryReference), + _outputSampleRate(outputSampleRate), + _outputChannelCount(outputChannelCount) + { + av_log_set_level(AV_LOG_QUIET); + + if (memoryReference.isValid()) + { + _avFormatContext = avformat_alloc_context(); + if (!_avFormatContext) + { + throw std::runtime_error("Cannot allocate format context"); + } + + _avIOBufferData = AVIOBufferData( + reinterpret_cast(memoryReference.getData()), + memoryReference.getSize()); + _avIOContextBuffer = static_cast(av_malloc(avIOContextBufferSize)); + _avIOContext = avio_alloc_context( + _avIOContextBuffer, + avIOContextBufferSize, + 0, + &_avIOBufferData, + &_avIOBufferRead, + nullptr, + &_avIOBufferSeek); + if (!_avIOContext) + { + throw std::runtime_error("Cannot allocate I/O context"); + } + + _avFormatContext->pb = _avIOContext; + } + + const std::string fileName = path.string(); + int r = avformat_open_input( + &_avFormatContext, + !_avFormatContext ? fileName.c_str() : nullptr, + nullptr, + nullptr); + if (r < 0 || !_avFormatContext) + { + throw std::runtime_error("Cannot open file"); + } + + r = avformat_find_stream_info(_avFormatContext, nullptr); + if (r < 0) + { + throw std::runtime_error("Cannot find stream info"); + } + + for (unsigned int i = 0; i < _avFormatContext->nb_streams; ++i) + { + if (AVMEDIA_TYPE_AUDIO == _avFormatContext->streams[i]->codecpar->codec_type && + AV_DISPOSITION_DEFAULT == _avFormatContext->streams[i]->disposition) + { + _avStream = i; + break; + } + } + if (-1 == _avStream) + { + for (unsigned int i = 0; i < _avFormatContext->nb_streams; ++i) + { + if (AVMEDIA_TYPE_AUDIO == _avFormatContext->streams[i]->codecpar->codec_type) + { + _avStream = i; + break; + } + } + } + + if (_avStream != -1) + { + auto avAudioStream = _avFormatContext->streams[_avStream]; + auto avAudioCodecParameters = avAudioStream->codecpar; + auto avAudioCodec = avcodec_find_decoder(avAudioCodecParameters->codec_id); + if (!avAudioCodec) + { + throw std::runtime_error("No audio codec found"); + } + _avCodecParameters = avcodec_parameters_alloc(); + if (!_avCodecParameters) + { + throw std::runtime_error("Cannot allocate parameters"); + } + avcodec_parameters_copy(_avCodecParameters, avAudioCodecParameters); + _avCodecContext = avcodec_alloc_context3(avAudioCodec); + if (!_avCodecContext) + { + throw std::runtime_error("Cannot allocate context"); + } + avcodec_parameters_to_context(_avCodecContext, _avCodecParameters); + _avCodecContext->thread_count = 0; + _avCodecContext->thread_type = FF_THREAD_FRAME; + r = avcodec_open2(_avCodecContext, avAudioCodec, 0); + if (r < 0) + { + throw std::runtime_error("Cannot open audio stream"); + } + + _avFrame = av_frame_alloc(); + if (!_avFrame) + { + throw std::runtime_error("Cannot allocate frame"); + } + + AVChannelLayout outLayout; + av_channel_layout_default(&outLayout, _outputChannelCount); + + r = swr_alloc_set_opts2( + &_swrContext, + &outLayout, + AV_SAMPLE_FMT_FLT, + _outputSampleRate, + &_avCodecParameters->ch_layout, + static_cast(_avCodecParameters->format), + _avCodecParameters->sample_rate, + 0, + nullptr); + if (r < 0 || !_swrContext) + { + throw std::runtime_error("Cannot allocate resampler context"); + } + r = swr_init(_swrContext); + if (r < 0) + { + throw std::runtime_error("Cannot initialize resampler"); + } + + double duration = 0.0; + if (avAudioStream->duration != AV_NOPTS_VALUE) + { + duration = av_q2d(avAudioStream->time_base) * avAudioStream->duration; + } + else if (_avFormatContext->duration != AV_NOPTS_VALUE) + { + duration = static_cast(_avFormatContext->duration) / AV_TIME_BASE; + } + + const double rate = _avCodecParameters->sample_rate; + const int64_t totalSamples = static_cast(std::round(duration * rate)); + _timeRange = OTIO_NS::TimeRange( + OTIO_NS::RationalTime(0.0, rate), + OTIO_NS::RationalTime(totalSamples, rate)); + _currentTime = OTIO_NS::RationalTime(0.0, rate); + } + } + + AudioRead::~AudioRead() + { + if (_swrContext) + { + swr_free(&_swrContext); + } + if (_avFrame) + { + av_frame_free(&_avFrame); + } + if (_avCodecContext) + { + avcodec_free_context(&_avCodecContext); + } + if (_avCodecParameters) + { + avcodec_parameters_free(&_avCodecParameters); + } + if (_avIOContext) + { + avio_context_free(&_avIOContext); + } + if (_avFormatContext) + { + avformat_close_input(&_avFormatContext); + } + } + + bool AudioRead::hasAudio() const + { + return _avStream != -1; + } + + int AudioRead::getSampleRate() const + { + return _outputSampleRate; + } + + int AudioRead::getChannelCount() const + { + return _outputChannelCount; + } + + const OTIO_NS::TimeRange& AudioRead::getTimeRange() const + { + return _timeRange; + } + + AudioBuffer AudioRead::getAudio( + const OTIO_NS::RationalTime& time, + int sampleCount) + { + AudioBuffer out; + out.sampleRate = _outputSampleRate; + out.channelCount = _outputChannelCount; + out.sampleCount = sampleCount; + + if (_avStream == -1) + { + out.data.resize(sampleCount * _outputChannelCount, 0.F); + return out; + } + + const OTIO_NS::RationalTime normalizedTime = + time.rescaled_to(_timeRange.duration().rate()); + + if (normalizedTime != _currentTime) + { + _seek(normalizedTime); + } + + std::vector samples; + samples.reserve(sampleCount * _outputChannelCount); + + if (!_residual.empty()) + { + const int residualSamples = static_cast( + _residual.size()) / _outputChannelCount; + if (residualSamples >= sampleCount) + { + const int needed = sampleCount * _outputChannelCount; + samples.insert(samples.end(), _residual.begin(), _residual.begin() + needed); + _residual.erase(_residual.begin(), _residual.begin() + needed); + out.data = std::move(samples); + _currentTime = OTIO_NS::RationalTime( + _currentTime.value() + + static_cast(sampleCount) * + _timeRange.duration().rate() / _outputSampleRate, + _timeRange.duration().rate()); + return out; + } + samples.insert(samples.end(), _residual.begin(), _residual.end()); + _residual.clear(); + } + + const int remaining = sampleCount - + static_cast(samples.size()) / _outputChannelCount; + _decode(samples, remaining); + + const int totalSamples = static_cast(samples.size()) / _outputChannelCount; + if (totalSamples > sampleCount) + { + const int needed = sampleCount * _outputChannelCount; + _residual.assign(samples.begin() + needed, samples.end()); + samples.resize(needed); + } + else if (totalSamples < sampleCount) + { + samples.resize(sampleCount * _outputChannelCount, 0.F); + } + + out.data = std::move(samples); + _currentTime = OTIO_NS::RationalTime( + _currentTime.value() + + static_cast(sampleCount) * + _timeRange.duration().rate() / _outputSampleRate, + _timeRange.duration().rate()); + return out; + } + + void AudioRead::_seek(const OTIO_NS::RationalTime& time) + { + if (_avStream != -1) + { + avcodec_flush_buffers(_avCodecContext); + swr_close(_swrContext); + swr_init(_swrContext); + + const double seconds = + time.to_seconds() - _timeRange.start_time().to_seconds(); + const int64_t samplePos = static_cast( + seconds * _avCodecParameters->sample_rate); + const int64_t timestamp = av_rescale_q( + samplePos, + { 1, _avCodecParameters->sample_rate }, + _avFormatContext->streams[_avStream]->time_base); + av_seek_frame( + _avFormatContext, + _avStream, + timestamp, + AVSEEK_FLAG_BACKWARD); + _currentTime = time; + _residual.clear(); + } + _eof = false; + } + + void AudioRead::_decode(std::vector& output, int sampleCount) + { + if (_avStream == -1) return; + + const int initialSize = static_cast(output.size()); + const int totalNeeded = initialSize + sampleCount * _outputChannelCount; + + Packet packet; + int decoding = 0; + while (0 == decoding && static_cast(output.size()) < totalNeeded) + { + if (!_eof) + { + decoding = av_read_frame(_avFormatContext, packet.p); + if (AVERROR_EOF == decoding) + { + _eof = true; + decoding = 0; + } + else if (decoding < 0) + { + break; + } + } + if ((_eof && _avStream != -1) || (_avStream == packet.p->stream_index)) + { + decoding = avcodec_send_packet( + _avCodecContext, + _eof ? nullptr : packet.p); + if (AVERROR_EOF == decoding) + { + decoding = 0; + } + else if (decoding < 0) + { + break; + } + + while (0 == decoding) + { + decoding = avcodec_receive_frame(_avCodecContext, _avFrame); + if (decoding < 0) + { + break; + } + + const int maxOutputSamples = swr_get_out_samples( + _swrContext, _avFrame->nb_samples); + if (maxOutputSamples <= 0) continue; + + std::vector converted(maxOutputSamples * _outputChannelCount); + uint8_t* outBuf = reinterpret_cast(converted.data()); + + const int convertedSamples = swr_convert( + _swrContext, + &outBuf, + maxOutputSamples, + const_cast(_avFrame->extended_data), + _avFrame->nb_samples); + + if (convertedSamples > 0) + { + output.insert( + output.end(), + converted.begin(), + converted.begin() + convertedSamples * _outputChannelCount); + } + + if (static_cast(output.size()) >= totalNeeded) + { + decoding = 1; + break; + } + } + + if (AVERROR(EAGAIN) == decoding) + { + decoding = 0; + } + else if (AVERROR_EOF == decoding) + { + break; + } + else if (decoding < 0) + { + break; + } + else if (1 == decoding) + { + break; + } + } + if (packet.p->buf) + { + av_packet_unref(packet.p); + } + } + if (packet.p->buf) + { + av_packet_unref(packet.p); + } + } + + AudioRead::AVIOBufferData::AVIOBufferData() + { + } + + AudioRead::AVIOBufferData::AVIOBufferData(const uint8_t* data, size_t size) : + data(data), + size(size) + { + } + + int AudioRead::_avIOBufferRead(void* opaque, uint8_t* buf, int bufSize) + { + AVIOBufferData* bufferData = static_cast(opaque); + + const int64_t remaining = bufferData->size - bufferData->offset; + int bufSizeClamped = std::min(std::max( + static_cast(bufSize), + static_cast(0)), + remaining); + if (!bufSizeClamped) + { + return AVERROR_EOF; + } + + memcpy(buf, bufferData->data + bufferData->offset, bufSizeClamped); + bufferData->offset += bufSizeClamped; + + return bufSizeClamped; + } + + int64_t AudioRead::_avIOBufferSeek(void* opaque, int64_t offset, int whence) + { + AVIOBufferData* bufferData = static_cast(opaque); + + if (whence & AVSEEK_SIZE) + { + return bufferData->size; + } + + bufferData->offset = std::min(std::max( + offset, + static_cast(0)), + static_cast(bufferData->size)); + + return offset; + } + } +} diff --git a/lib/toucanRender/FFmpegAudioRead.h b/lib/toucanRender/FFmpegAudioRead.h new file mode 100644 index 0000000..0ca77f6 --- /dev/null +++ b/lib/toucanRender/FFmpegAudioRead.h @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#pragma once + +#include +#include +#include + +#include + +extern "C" +{ +#include +#include +#include + +} // extern "C" + +#include + +namespace toucan +{ + namespace ffmpeg + { + class AudioRead : public std::enable_shared_from_this + { + public: + AudioRead( + const std::filesystem::path&, + int outputSampleRate, + int outputChannelCount, + const MemoryReference& = {}); + + virtual ~AudioRead(); + + bool hasAudio() const; + int getSampleRate() const; + int getChannelCount() const; + const OTIO_NS::TimeRange& getTimeRange() const; + + AudioBuffer getAudio( + const OTIO_NS::RationalTime& time, + int sampleCount); + + private: + void _seek(const OTIO_NS::RationalTime&); + void _decode(std::vector& output, int sampleCount); + + std::filesystem::path _path; + MemoryReference _memoryReference; + int _outputSampleRate = 0; + int _outputChannelCount = 0; + OTIO_NS::TimeRange _timeRange; + OTIO_NS::RationalTime _currentTime; + + struct AVIOBufferData + { + AVIOBufferData(); + AVIOBufferData(const uint8_t*, size_t size); + + const uint8_t* data = nullptr; + size_t size = 0; + size_t offset = 0; + }; + static int _avIOBufferRead(void* opaque, uint8_t* buf, int bufSize); + static int64_t _avIOBufferSeek(void* opaque, int64_t offset, int whence); + + AVFormatContext* _avFormatContext = nullptr; + AVIOBufferData _avIOBufferData; + uint8_t* _avIOContextBuffer = nullptr; + AVIOContext* _avIOContext = nullptr; + int _avStream = -1; + AVCodecParameters* _avCodecParameters = nullptr; + AVCodecContext* _avCodecContext = nullptr; + AVFrame* _avFrame = nullptr; + SwrContext* _swrContext = nullptr; + bool _eof = false; + + std::vector _residual; + }; + } +} diff --git a/lib/toucanRender/FFmpegAudioWrite.cpp b/lib/toucanRender/FFmpegAudioWrite.cpp new file mode 100644 index 0000000..73ae808 --- /dev/null +++ b/lib/toucanRender/FFmpegAudioWrite.cpp @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#include "FFmpegAudioWrite.h" + +#include +#include + +extern "C" +{ +#include +#include +} + +namespace toucan +{ + namespace ffmpeg + { + AudioWrite::AudioWrite( + const std::filesystem::path& path, + int sampleRate, + int channelCount, + AudioCodec audioCodec) : + _path(path), + _sampleRate(sampleRate), + _channelCount(channelCount) + { + av_log_set_level(AV_LOG_QUIET); + + AVCodecID avCodecID = getAudioCodecId(audioCodec); + + int r = avformat_alloc_output_context2(&_avFormatContext, NULL, NULL, _path.string().c_str()); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + const AVCodec* avCodec = avcodec_find_encoder(avCodecID); + if (!avCodec) + { + throw std::runtime_error("Cannot find audio encoder"); + } + _avCodecContext = avcodec_alloc_context3(avCodec); + if (!_avCodecContext) + { + throw std::runtime_error("Cannot allocate context"); + } + _avAudioStream = avformat_new_stream(_avFormatContext, avCodec); + if (!_avAudioStream) + { + throw std::runtime_error("Cannot allocate stream"); + } + + _avCodecContext->codec_id = avCodec->id; + _avCodecContext->codec_type = AVMEDIA_TYPE_AUDIO; + _avCodecContext->sample_rate = sampleRate; + av_channel_layout_default(&_avCodecContext->ch_layout, channelCount); + _avCodecContext->sample_fmt = avCodec->sample_fmts ? + avCodec->sample_fmts[0] : getAudioSampleFormat(audioCodec); + _avCodecContext->time_base = { 1, sampleRate }; + if (_avFormatContext->oformat->flags & AVFMT_GLOBALHEADER) + { + _avCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + } + + r = avcodec_open2(_avCodecContext, avCodec, NULL); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + r = avcodec_parameters_from_context(_avAudioStream->codecpar, _avCodecContext); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + _avAudioStream->time_base = { 1, sampleRate }; + + r = avio_open(&_avFormatContext->pb, _path.string().c_str(), AVIO_FLAG_WRITE); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + r = avformat_write_header(_avFormatContext, NULL); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + _avPacket = av_packet_alloc(); + if (!_avPacket) + { + throw std::runtime_error("Cannot allocate packet"); + } + + _frameSize = _avCodecContext->frame_size; + if (_frameSize <= 0) + { + _frameSize = 1024; + } + + _avFrame = av_frame_alloc(); + if (!_avFrame) + { + throw std::runtime_error("Cannot allocate frame"); + } + _avFrame->format = _avCodecContext->sample_fmt; + _avFrame->ch_layout = _avCodecContext->ch_layout; + _avFrame->sample_rate = sampleRate; + _avFrame->nb_samples = _frameSize; + r = av_frame_get_buffer(_avFrame, 0); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + if (_avCodecContext->sample_fmt != AV_SAMPLE_FMT_FLT) + { + AVChannelLayout inLayout; + av_channel_layout_default(&inLayout, channelCount); + + r = swr_alloc_set_opts2( + &_swrContext, + &_avCodecContext->ch_layout, + _avCodecContext->sample_fmt, + sampleRate, + &inLayout, + AV_SAMPLE_FMT_FLT, + sampleRate, + 0, + nullptr); + if (r < 0 || !_swrContext) + { + throw std::runtime_error("Cannot allocate resampler context"); + } + r = swr_init(_swrContext); + if (r < 0) + { + throw std::runtime_error("Cannot initialize resampler"); + } + } + + _opened = true; + } + + AudioWrite::~AudioWrite() + { + if (_opened) + { + _flushFifo(); + _encodeAudio(nullptr); + av_write_trailer(_avFormatContext); + } + if (_swrContext) + { + swr_free(&_swrContext); + } + if (_avFrame) + { + av_frame_free(&_avFrame); + } + if (_avPacket) + { + av_packet_free(&_avPacket); + } + if (_avCodecContext) + { + avcodec_free_context(&_avCodecContext); + } + if (_avFormatContext && _avFormatContext->pb) + { + avio_closep(&_avFormatContext->pb); + } + if (_avFormatContext) + { + avformat_free_context(_avFormatContext); + } + } + + void AudioWrite::writeAudio(const AudioBuffer& buffer) + { + _fifo.insert(_fifo.end(), buffer.data.begin(), buffer.data.end()); + + while (static_cast(_fifo.size()) / _channelCount >= _frameSize) + { + av_frame_make_writable(_avFrame); + _avFrame->nb_samples = _frameSize; + + if (_swrContext) + { + const uint8_t* inBuf = reinterpret_cast(_fifo.data()); + swr_convert( + _swrContext, + _avFrame->extended_data, + _frameSize, + &inBuf, + _frameSize); + } + else + { + memcpy( + _avFrame->data[0], + _fifo.data(), + _frameSize * _channelCount * sizeof(float)); + } + + _fifo.erase(_fifo.begin(), _fifo.begin() + _frameSize * _channelCount); + + _avFrame->pts = _pts; + _pts += _frameSize; + _encodeAudio(_avFrame); + } + } + + void AudioWrite::_flushFifo() + { + if (_fifo.empty()) return; + + const int remainingSamples = static_cast(_fifo.size()) / _channelCount; + if (remainingSamples <= 0) return; + + av_frame_make_writable(_avFrame); + _avFrame->nb_samples = remainingSamples; + + if (_swrContext) + { + const uint8_t* inBuf = reinterpret_cast(_fifo.data()); + swr_convert( + _swrContext, + _avFrame->extended_data, + remainingSamples, + &inBuf, + remainingSamples); + } + else + { + memcpy( + _avFrame->data[0], + _fifo.data(), + remainingSamples * _channelCount * sizeof(float)); + } + + _fifo.clear(); + + _avFrame->pts = _pts; + _pts += remainingSamples; + _encodeAudio(_avFrame); + } + + void AudioWrite::_encodeAudio(AVFrame* frame) + { + int r = avcodec_send_frame(_avCodecContext, frame); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + while (r >= 0) + { + r = avcodec_receive_packet(_avCodecContext, _avPacket); + if (r == AVERROR(EAGAIN) || r == AVERROR_EOF) + { + return; + } + else if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + _avPacket->stream_index = _avAudioStream->index; + r = av_interleaved_write_frame(_avFormatContext, _avPacket); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + av_packet_unref(_avPacket); + } + } + } +} diff --git a/lib/toucanRender/FFmpegAudioWrite.h b/lib/toucanRender/FFmpegAudioWrite.h new file mode 100644 index 0000000..bf3da66 --- /dev/null +++ b/lib/toucanRender/FFmpegAudioWrite.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Contributors to the toucan project. + +#pragma once + +#include +#include + +#include + +extern "C" +{ +#include +#include +#include + +} // extern "C" + +#include +#include + +namespace toucan +{ + namespace ffmpeg + { + class AudioWrite : public std::enable_shared_from_this + { + public: + AudioWrite( + const std::filesystem::path&, + int sampleRate, + int channelCount, + AudioCodec); + + virtual ~AudioWrite(); + + void writeAudio(const AudioBuffer&); + + private: + void _encodeAudio(AVFrame*); + void _flushFifo(); + + std::filesystem::path _path; + int _sampleRate = 0; + int _channelCount = 0; + AVFormatContext* _avFormatContext = nullptr; + AVCodecContext* _avCodecContext = nullptr; + AVStream* _avAudioStream = nullptr; + AVPacket* _avPacket = nullptr; + AVFrame* _avFrame = nullptr; + SwrContext* _swrContext = nullptr; + int64_t _pts = 0; + int _frameSize = 0; + std::vector _fifo; + bool _opened = false; + }; + } +} diff --git a/lib/toucanRender/FFmpegWrite.cpp b/lib/toucanRender/FFmpegWrite.cpp index b1942b4..476e9e8 100644 --- a/lib/toucanRender/FFmpegWrite.cpp +++ b/lib/toucanRender/FFmpegWrite.cpp @@ -14,6 +14,7 @@ extern "C" { #include #include +#include } namespace toucan @@ -24,10 +25,15 @@ namespace toucan const std::filesystem::path& path, const OIIO::ImageSpec& spec, const OTIO_NS::TimeRange& timeRange, - VideoCodec videoCodec) : + VideoCodec videoCodec, + int audioSampleRate, + int audioChannelCount, + AudioCodec audioCodec) : _path(path), _spec(spec), - _timeRange(timeRange) + _timeRange(timeRange), + _audioSampleRate(audioSampleRate), + _audioChannelCount(audioChannelCount) { av_log_set_level(AV_LOG_QUIET); //av_log_set_level(AV_LOG_VERBOSE); @@ -93,6 +99,105 @@ namespace toucan _avVideoStream->time_base = { rational.second, rational.first }; _avVideoStream->avg_frame_rate = { rational.first, rational.second }; + if (audioSampleRate > 0 && audioChannelCount > 0) + { + AVCodecID audioCodecId = getAudioCodecId(audioCodec); + const AVCodec* audioAvCodec = avcodec_find_encoder(audioCodecId); + if (!audioAvCodec) + { + throw std::runtime_error("Cannot find audio encoder"); + } + _avAudioCodecContext = avcodec_alloc_context3(audioAvCodec); + if (!_avAudioCodecContext) + { + throw std::runtime_error("Cannot allocate audio context"); + } + _avAudioStream = avformat_new_stream(_avFormatContext, audioAvCodec); + if (!_avAudioStream) + { + throw std::runtime_error("Cannot allocate audio stream"); + } + + _avAudioCodecContext->codec_id = audioAvCodec->id; + _avAudioCodecContext->codec_type = AVMEDIA_TYPE_AUDIO; + _avAudioCodecContext->sample_rate = audioSampleRate; + av_channel_layout_default(&_avAudioCodecContext->ch_layout, audioChannelCount); + _avAudioCodecContext->sample_fmt = audioAvCodec->sample_fmts ? + audioAvCodec->sample_fmts[0] : getAudioSampleFormat(audioCodec); + _avAudioCodecContext->time_base = { 1, audioSampleRate }; + if (_avFormatContext->oformat->flags & AVFMT_GLOBALHEADER) + { + _avAudioCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + } + + r = avcodec_open2(_avAudioCodecContext, audioAvCodec, NULL); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + r = avcodec_parameters_from_context(_avAudioStream->codecpar, _avAudioCodecContext); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + _avAudioStream->time_base = { 1, audioSampleRate }; + + _audioFrameSize = _avAudioCodecContext->frame_size; + if (_audioFrameSize <= 0) + { + _audioFrameSize = 1024; + } + + _avAudioPacket = av_packet_alloc(); + if (!_avAudioPacket) + { + throw std::runtime_error("Cannot allocate audio packet"); + } + + _avAudioFrame = av_frame_alloc(); + if (!_avAudioFrame) + { + throw std::runtime_error("Cannot allocate audio frame"); + } + _avAudioFrame->format = _avAudioCodecContext->sample_fmt; + _avAudioFrame->ch_layout = _avAudioCodecContext->ch_layout; + _avAudioFrame->sample_rate = audioSampleRate; + _avAudioFrame->nb_samples = _audioFrameSize; + r = av_frame_get_buffer(_avAudioFrame, 0); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + if (_avAudioCodecContext->sample_fmt != AV_SAMPLE_FMT_FLT) + { + AVChannelLayout inLayout; + av_channel_layout_default(&inLayout, audioChannelCount); + + r = swr_alloc_set_opts2( + &_swrContext, + &_avAudioCodecContext->ch_layout, + _avAudioCodecContext->sample_fmt, + audioSampleRate, + &inLayout, + AV_SAMPLE_FMT_FLT, + audioSampleRate, + 0, + nullptr); + if (r < 0 || !_swrContext) + { + throw std::runtime_error("Cannot allocate resampler context"); + } + r = swr_init(_swrContext); + if (r < 0) + { + throw std::runtime_error("Cannot initialize resampler"); + } + } + } + //av_dump_format(_avFormatContext, 0, _path.string().c_str(), 1); r = avio_open(&_avFormatContext->pb, _path.string().c_str(), AVIO_FLAG_WRITE); @@ -141,8 +246,29 @@ namespace toucan if (_opened) { _encodeVideo(nullptr); + if (_avAudioCodecContext) + { + _flushAudioFifo(); + _encodeAudio(nullptr); + } av_write_trailer(_avFormatContext); } + if (_swrContext) + { + swr_free(&_swrContext); + } + if (_avAudioFrame) + { + av_frame_free(&_avAudioFrame); + } + if (_avAudioPacket) + { + av_packet_free(&_avAudioPacket); + } + if (_avAudioCodecContext) + { + avcodec_free_context(&_avAudioCodecContext); + } if (_swsContext) { sws_freeContext(_swsContext); @@ -299,6 +425,44 @@ namespace toucan _encodeVideo(_avFrame); } + void Write::writeAudio(const AudioBuffer& buffer) + { + if (!_avAudioCodecContext) return; + + _audioFifo.insert(_audioFifo.end(), buffer.data.begin(), buffer.data.end()); + + while (static_cast(_audioFifo.size()) / _audioChannelCount >= _audioFrameSize) + { + av_frame_make_writable(_avAudioFrame); + _avAudioFrame->nb_samples = _audioFrameSize; + + if (_swrContext) + { + const uint8_t* inBuf = reinterpret_cast(_audioFifo.data()); + swr_convert( + _swrContext, + _avAudioFrame->extended_data, + _audioFrameSize, + &inBuf, + _audioFrameSize); + } + else + { + memcpy( + _avAudioFrame->data[0], + _audioFifo.data(), + _audioFrameSize * _audioChannelCount * sizeof(float)); + } + + _audioFifo.erase(_audioFifo.begin(), + _audioFifo.begin() + _audioFrameSize * _audioChannelCount); + + _avAudioFrame->pts = _audioPts; + _audioPts += _audioFrameSize; + _encodeAudio(_avAudioFrame); + } + } + void Write::_encodeVideo(AVFrame* frame) { int r = avcodec_send_frame(_avCodecContext, frame); @@ -326,5 +490,69 @@ namespace toucan av_packet_unref(_avPacket); } } + + void Write::_encodeAudio(AVFrame* frame) + { + int r = avcodec_send_frame(_avAudioCodecContext, frame); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + + while (r >= 0) + { + r = avcodec_receive_packet(_avAudioCodecContext, _avAudioPacket); + if (r == AVERROR(EAGAIN) || r == AVERROR_EOF) + { + return; + } + else if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + _avAudioPacket->stream_index = _avAudioStream->index; + r = av_interleaved_write_frame(_avFormatContext, _avAudioPacket); + if (r < 0) + { + throw std::runtime_error(getErrorLabel(r)); + } + av_packet_unref(_avAudioPacket); + } + } + + void Write::_flushAudioFifo() + { + if (_audioFifo.empty()) return; + + const int remainingSamples = static_cast(_audioFifo.size()) / _audioChannelCount; + if (remainingSamples <= 0) return; + + av_frame_make_writable(_avAudioFrame); + _avAudioFrame->nb_samples = remainingSamples; + + if (_swrContext) + { + const uint8_t* inBuf = reinterpret_cast(_audioFifo.data()); + swr_convert( + _swrContext, + _avAudioFrame->extended_data, + remainingSamples, + &inBuf, + remainingSamples); + } + else + { + memcpy( + _avAudioFrame->data[0], + _audioFifo.data(), + remainingSamples * _audioChannelCount * sizeof(float)); + } + + _audioFifo.clear(); + + _avAudioFrame->pts = _audioPts; + _audioPts += remainingSamples; + _encodeAudio(_avAudioFrame); + } } } diff --git a/lib/toucanRender/FFmpegWrite.h b/lib/toucanRender/FFmpegWrite.h index 13f797c..5f88dc7 100644 --- a/lib/toucanRender/FFmpegWrite.h +++ b/lib/toucanRender/FFmpegWrite.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -12,11 +13,13 @@ extern "C" { #include +#include #include } // extern "C" #include +#include namespace toucan { @@ -29,14 +32,20 @@ namespace toucan const std::filesystem::path&, const OIIO::ImageSpec&, const OTIO_NS::TimeRange&, - VideoCodec); + VideoCodec, + int audioSampleRate = 0, + int audioChannelCount = 0, + AudioCodec audioCodec = AudioCodec::PCM_S16LE); virtual ~Write(); void writeImage(const OIIO::ImageBuf&, const OTIO_NS::RationalTime&); + void writeAudio(const AudioBuffer&); private: void _encodeVideo(AVFrame*); + void _encodeAudio(AVFrame*); + void _flushAudioFifo(); std::filesystem::path _path; OIIO::ImageSpec _spec; @@ -50,6 +59,17 @@ namespace toucan AVFrame* _avFrame2 = nullptr; SwsContext* _swsContext = nullptr; bool _opened = false; + + AVCodecContext* _avAudioCodecContext = nullptr; + AVStream* _avAudioStream = nullptr; + AVPacket* _avAudioPacket = nullptr; + AVFrame* _avAudioFrame = nullptr; + SwrContext* _swrContext = nullptr; + int64_t _audioPts = 0; + int _audioSampleRate = 0; + int _audioChannelCount = 0; + int _audioFrameSize = 0; + std::vector _audioFifo; }; } } diff --git a/lib/toucanRender/TimelineAlgo.cpp b/lib/toucanRender/TimelineAlgo.cpp index 81207d4..0b5996f 100644 --- a/lib/toucanRender/TimelineAlgo.cpp +++ b/lib/toucanRender/TimelineAlgo.cpp @@ -24,4 +24,22 @@ namespace toucan } return out; } + + std::vector > + getAudioClips(const OTIO_NS::SerializableObject::Retainer& timeline) + { + std::vector > out; + for (const auto& child : timeline->tracks()->children()) + { + if (auto track = OTIO_NS::dynamic_retainer_cast(child)) + { + if (OTIO_NS::Track::Kind::audio == track->kind()) + { + const auto clips = track->find_clips(nullptr, std::nullopt, true); + out.insert(out.end(), clips.begin(), clips.end()); + } + } + } + return out; + } } diff --git a/lib/toucanRender/TimelineAlgo.h b/lib/toucanRender/TimelineAlgo.h index b47a79e..c24ad87 100644 --- a/lib/toucanRender/TimelineAlgo.h +++ b/lib/toucanRender/TimelineAlgo.h @@ -10,4 +10,8 @@ namespace toucan //! Get the video clips in a timeline. std::vector > getVideoClips(const OTIO_NS::SerializableObject::Retainer&); + + //! Get the audio clips in a timeline. + std::vector > + getAudioClips(const OTIO_NS::SerializableObject::Retainer&); }