#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/channel_layout.h>
#include <libavutil/opt.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
std::unordered_map<std::string, Kakshya::RegionGroup>
SoundFileReader::regions_to_groups(const std::vector<FileRegion>& regions)
{
    std::unordered_map<std::string, Kakshya::RegionGroup> groups;

    for (const auto& region : regions) {
        auto& group = groups[region.type];
        group.name = region.type;
        group.add_region(region.to_region());
    }
    return groups;
}
    // initialize_ffmpeg()
    av_log_set_level(AV_LOG_WARNING);

    // can_read(): probe the file for a decodable audio stream.
    AVFormatContext* format_ctx = nullptr;
    int ret = avformat_open_input(&format_ctx, filepath.c_str(), nullptr, nullptr);
    // ...
    bool has_audio = false;
    if (avformat_find_stream_info(format_ctx, nullptr) >= 0) {
        int audio_stream = av_find_best_stream(format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0);
        has_audio = (audio_stream >= 0);
    }
    avformat_close_input(&format_ctx);
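For comparison, the same probe can be written as a free-standing helper against libavformat alone. This is a minimal sketch under a hypothetical helper name, not the class's can_read() itself:

extern "C" {
#include <libavformat/avformat.h>
}

// Minimal sketch: does this file contain a decodable audio stream?
static bool file_has_audio_stream(const char* path)
{
    AVFormatContext* fmt = nullptr;
    if (avformat_open_input(&fmt, path, nullptr, nullptr) < 0)
        return false;                                    // unreadable or unrecognised container

    bool has_audio = false;
    if (avformat_find_stream_info(fmt, nullptr) >= 0)
        has_audio = av_find_best_stream(fmt, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0) >= 0;

    avformat_close_input(&fmt);
    return has_audio;
}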
    // open(): set up the FFmpeg decoding context for the file.
    auto ctx = std::make_shared<FFmpegContext>();
    if (avformat_open_input(&ctx->format_context, filepath.c_str(), nullptr, nullptr) < 0) {
        set_error("Failed to open file: " + filepath);
        // ...
    }

    if (avformat_find_stream_info(ctx->format_context, nullptr) < 0) {
        // ...
    }

    const AVCodec* codec = nullptr;
    ctx->audio_stream_index = av_find_best_stream(
        ctx->format_context, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0);

    if (ctx->audio_stream_index < 0 || !codec) {
        // ...
    }

    ctx->codec_context = avcodec_alloc_context3(codec);
    if (!ctx->codec_context) {
        set_error("Failed to allocate codec context");
        // ...
    }

    AVStream* stream = ctx->format_context->streams[ctx->audio_stream_index];
    if (avcodec_parameters_to_context(ctx->codec_context, stream->codecpar) < 0) {
        set_error("Failed to copy codec parameters");
        // ...
    }

    if (avcodec_open2(ctx->codec_context, codec, nullptr) < 0) {
        // ...
    }

    // Estimate the total frame count from the stream or container duration.
    if (stream->duration != AV_NOPTS_VALUE && stream->time_base.num && stream->time_base.den) {
        double duration_seconds = stream->duration * av_q2d(stream->time_base);
        ctx->total_frames = static_cast<uint64_t>(duration_seconds * ctx->codec_context->sample_rate);
    } else if (ctx->format_context->duration != AV_NOPTS_VALUE) {
        double duration_seconds = ctx->format_context->duration / static_cast<double>(AV_TIME_BASE);
        ctx->total_frames = static_cast<uint64_t>(duration_seconds * ctx->codec_context->sample_rate);
    } else {
        ctx->total_frames = 0;
    }

    ctx->sample_rate = ctx->codec_context->sample_rate;
    ctx->channels = ctx->codec_context->ch_layout.nb_channels;
    // ...
    if (!ctx->is_valid()) {
        set_error("Invalid context after initialization");
        // ...
    }
    // setup_resampler()
    if (!ctx || !ctx->codec_context) {
        // ...
    }

    AVChannelLayout out_ch_layout;
    av_channel_layout_copy(&out_ch_layout, &ctx->codec_context->ch_layout);
    // ...
    int ret = swr_alloc_set_opts2(&ctx->swr_context,
        &out_ch_layout, out_sample_fmt, out_sample_rate,
        &ctx->codec_context->ch_layout, ctx->codec_context->sample_fmt,
        ctx->codec_context->sample_rate,
        0, nullptr);   // log_offset, log_ctx

    av_channel_layout_uninit(&out_ch_layout);

    if (ret < 0 || !ctx->swr_context) {
        set_error("Failed to allocate resampler");
        // ...
    }

    if (swr_init(ctx->swr_context) < 0) {
        set_error("Failed to initialize resampler");
        // ...
    }
    metadata.format = ctx->format_context->iformat->name;
    metadata.mime_type = ctx->format_context->iformat->mime_type
        ? ctx->format_context->iformat->mime_type
        : "audio/" + std::string(ctx->format_context->iformat->name);
    // ...
    auto ftime = std::filesystem::last_write_time(m_filepath);
    // ...
        std::chrono::seconds(std::chrono::duration_cast<std::chrono::seconds>(
            ftime.time_since_epoch())));
    // ...
    metadata.attributes["codec"] = avcodec_get_name(ctx->codec_context->codec_id);
    metadata.attributes["codec_long_name"] = ctx->codec_context->codec->long_name;
    metadata.attributes["total_frames"] = ctx->total_frames;
    metadata.attributes["sample_rate"] = ctx->sample_rate;
    metadata.attributes["channels"] = ctx->channels;

    char layout_desc[256];
    av_channel_layout_describe(&ctx->codec_context->ch_layout, layout_desc, sizeof(layout_desc));
    metadata.attributes["channel_layout"] = std::string(layout_desc);
    metadata.attributes["bit_rate"] = ctx->codec_context->bit_rate;

    if (ctx->format_context->duration != AV_NOPTS_VALUE) {
        metadata.attributes["duration_seconds"] =
            ctx->format_context->duration / static_cast<double>(AV_TIME_BASE);
    } else if (ctx->total_frames > 0) {
        metadata.attributes["duration_seconds"] =
            ctx->total_frames / static_cast<double>(ctx->sample_rate);
    }

    AVDictionaryEntry* tag = nullptr;
    while ((tag = av_dict_get(ctx->format_context->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        metadata.attributes[std::string("tag_") + tag->key] = tag->value;
    }

    AVStream* stream = ctx->format_context->streams[ctx->audio_stream_index];
    // ...
    while ((tag = av_dict_get(stream->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        metadata.attributes[std::string("stream_") + tag->key] = tag->value;
    }
    if (!ctx || !ctx->is_valid()) {
        // ...
    }
    // ...
    metadata.format = ctx->format_context->iformat->name;
    metadata.mime_type = ctx->format_context->iformat->mime_type
        ? ctx->format_context->iformat->mime_type
        : "audio/" + std::string(ctx->format_context->iformat->name);
    // ...
    auto ftime = std::filesystem::last_write_time(m_filepath);
    // ...
        std::chrono::seconds(std::chrono::duration_cast<std::chrono::seconds>(
            ftime.time_since_epoch())));
    // ...
    metadata.attributes["codec"] = avcodec_get_name(ctx->codec_context->codec_id);
    metadata.attributes["codec_long_name"] = ctx->codec_context->codec->long_name;
    metadata.attributes["total_frames"] = ctx->total_frames;
    metadata.attributes["sample_rate"] = ctx->sample_rate;
    metadata.attributes["channels"] = ctx->channels;

    char layout_desc[256];
    av_channel_layout_describe(&ctx->codec_context->ch_layout, layout_desc, sizeof(layout_desc));
    metadata.attributes["channel_layout"] = std::string(layout_desc);
    metadata.attributes["bit_rate"] = ctx->codec_context->bit_rate;

    if (ctx->format_context->duration != AV_NOPTS_VALUE) {
        metadata.attributes["duration_seconds"] =
            ctx->format_context->duration / static_cast<double>(AV_TIME_BASE);
    } else if (ctx->total_frames > 0) {
        metadata.attributes["duration_seconds"] =
            ctx->total_frames / static_cast<double>(ctx->sample_rate);
    }

    AVDictionaryEntry* tag = nullptr;
    while ((tag = av_dict_get(ctx->format_context->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        metadata.attributes[std::string("tag_") + tag->key] = tag->value;
    }

    AVStream* stream = ctx->format_context->streams[ctx->audio_stream_index];
    // ...
    while ((tag = av_dict_get(stream->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        metadata.attributes[std::string("stream_") + tag->key] = tag->value;
    }
    // extract_regions()
    if (!ctx || !ctx->is_valid()) {
        // ...
    }

    // Chapters become "chapter" regions.
    for (unsigned int i = 0; i < ctx->format_context->nb_chapters; i++) {
        AVChapter* chapter = ctx->format_context->chapters[i];

        FileRegion region;
        region.type = "chapter";

        uint64_t start = av_rescale_q(chapter->start, chapter->time_base,
            AVRational { 1, static_cast<int>(ctx->sample_rate) });
        uint64_t end = av_rescale_q(chapter->end, chapter->time_base,
            AVRational { 1, static_cast<int>(ctx->sample_rate) });
        // ...
        AVDictionaryEntry* entry = nullptr;
        while ((entry = av_dict_get(chapter->metadata, "", entry, AV_DICT_IGNORE_SUFFIX))) {
            if (strcmp(entry->key, "title") == 0) {
                region.name = entry->value;
            }
            // ...
        }

        if (region.name.empty()) {
            region.name = "Chapter " + std::to_string(i + 1);
        }
        // ...
    }

    // Container-level tags that look like cue or loop markers also become regions.
    AVDictionaryEntry* tag = nullptr;
    while ((tag = av_dict_get(ctx->format_context->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        std::string key = tag->key;
        if (key.find("cue") != std::string::npos || key.find("CUE") != std::string::npos) {
            // ...
            region.attributes["description"] = tag->value;
            // ...
            uint64_t position = std::stoull(tag->value);
            // ...
        }
        // ...
        if (key.find("loop") != std::string::npos || key.find("LOOP") != std::string::npos) {
            // ...
            region.type = "loop";
            // ...
        }
    }
    // Guards in the read paths: the file was closed while an operation was in flight.
    set_error("File closed during operation");
    // ...
    set_error("File closed during operation");
std::vector<Kakshya::DataVariant> SoundFileReader::decode_frames(
    std::shared_ptr<FFmpegContext> ctx, uint64_t num_frames, uint64_t offset)
{
    // ...
    if (!ctx || !ctx->is_valid() || !ctx->swr_context) {
        set_error("Invalid context for decoding");
        // ...
    }

    std::vector<Kakshya::DataVariant> output_data;
    uint64_t frames_decoded = 0;

    AVPacket* packet = av_packet_alloc();
    AVFrame* frame = av_frame_alloc();

    if (!packet || !frame) {
        av_packet_free(&packet);
        av_frame_free(&frame);
        set_error("Failed to allocate packet/frame");
        // ...
    }

    int channels = ctx->channels;
    // ... (use_planar selects between PLANAR and INTERLEAVED output layouts)
    if (use_planar) {
        output_data.resize(channels);
        for (auto& channel_vector : output_data) {
            channel_vector = std::vector<double>();
            std::get<std::vector<double>>(channel_vector).reserve(num_frames);
        }
    } else {
        output_data.resize(1);
        output_data[0] = std::vector<double>();
        std::get<std::vector<double>>(output_data[0]).reserve(num_frames * channels);
    }

    uint8_t** resample_buffer = nullptr;
    int resample_linesize = 0;

    int max_resample_samples = av_rescale_rnd(/* ... */);
    // ...
    AVSampleFormat target_format = use_planar ? AV_SAMPLE_FMT_DBLP : AV_SAMPLE_FMT_DBL;

    int alloc_ret = av_samples_alloc_array_and_samples(
        &resample_buffer, &resample_linesize,
        channels, max_resample_samples, target_format, 0);

    if (alloc_ret < 0 || !resample_buffer) {
        av_packet_free(&packet);
        av_frame_free(&frame);
        set_error("Failed to allocate resample buffer");
        // ...
    }

    while (frames_decoded < num_frames) {
        int ret = av_read_frame(ctx->format_context, packet);
        // ...
        if (ret == AVERROR_EOF) {
            avcodec_send_packet(ctx->codec_context, nullptr);   // enter draining mode
            // ...
        } else if (packet->stream_index != ctx->audio_stream_index) {
            av_packet_unref(packet);
            // ...
        }

        ret = avcodec_send_packet(ctx->codec_context, packet);
        av_packet_unref(packet);

        if (ret < 0 && ret != AVERROR(EAGAIN)) {
            // ...
        }

        while (ret >= 0 && frames_decoded < num_frames) {
            ret = avcodec_receive_frame(ctx->codec_context, frame);

            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                // ...
            } else if (ret < 0) {
                // ...
            }

            int out_samples = swr_convert(ctx->swr_context,
                resample_buffer, max_resample_samples,
                (const uint8_t**)frame->data, frame->nb_samples);

            if (out_samples > 0) {
                uint64_t samples_to_copy = std::min(
                    static_cast<uint64_t>(out_samples),
                    num_frames - frames_decoded);

                if (use_planar) {
                    // PLANAR: append each channel to its own vector.
                    for (int ch = 0; ch < channels; ++ch) {
                        double* channel_data = reinterpret_cast<double*>(resample_buffer[ch]);
                        auto& channel_vector = std::get<std::vector<double>>(output_data[ch]);
                        channel_vector.insert(channel_vector.end(),
                            channel_data, channel_data + samples_to_copy);
                    }
                } else {
                    // INTERLEAVED: append LRLR... samples to the single vector.
                    double* interleaved_data = reinterpret_cast<double*>(resample_buffer[0]);
                    auto& interleaved_vector = std::get<std::vector<double>>(output_data[0]);
                    interleaved_vector.insert(interleaved_vector.end(),
                        interleaved_data, interleaved_data + samples_to_copy * channels);
                }

                frames_decoded += samples_to_copy;
            }
            // ...
            av_frame_unref(frame);
        }

        if (ret == AVERROR_EOF) {
            // ...
        }
    }

    av_frame_free(&frame);
    av_packet_free(&packet);

    if (resample_buffer) {
        av_freep(&resample_buffer[0]);
        av_freep(&resample_buffer);
    }
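Downstream, the returned vector can be unpacked without knowing the layout in advance. The only assumption is the one the code above already makes: Kakshya::DataVariant is a std::variant with a std::vector<double> alternative (one per channel for PLANAR, a single interleaved vector for INTERLEAVED). A sketch, assuming the project's headers:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <variant>
#include <vector>

// Sketch: report size and peak level for each DataVariant returned by decode_frames()/read_frames().
void print_peaks(const std::vector<Kakshya::DataVariant>& data)
{
    for (size_t i = 0; i < data.size(); ++i) {
        if (const auto* samples = std::get_if<std::vector<double>>(&data[i])) {
            double peak = 0.0;
            for (double s : *samples)
                peak = std::max(peak, std::abs(s));
            std::printf("variant %zu: %zu samples, peak %.3f\n", i, samples->size(), peak);
        }
    }
}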
    // read_region(): number of frames spanned by the region.
    uint64_t num_frames = (end > start) ? (end - start) : 1;
    // ...
    // seek(): validate the requested coordinate vector.
    if (position.empty()) {
        // ...
    }
    // seek_internal()
    if (!ctx || !ctx->is_valid()) {
        set_error("Invalid context for seeking");
        // ...
    }

    if (frame_position > ctx->total_frames) {
        frame_position = ctx->total_frames;
    }

    if (ctx->sample_rate == 0) {
        // ...
    }

    if (ctx->audio_stream_index < 0
        || ctx->audio_stream_index >= static_cast<int>(ctx->format_context->nb_streams)) {
        // ...
    }

    AVStream* stream = ctx->format_context->streams[ctx->audio_stream_index];

    int64_t timestamp = av_rescale_q(
        frame_position,
        AVRational { 1, static_cast<int>(ctx->sample_rate) },
        stream->time_base);

    int ret = av_seek_frame(
        ctx->format_context,
        ctx->audio_stream_index,
        timestamp,
        AVSEEK_FLAG_BACKWARD);
    // ...
    avcodec_flush_buffers(ctx->codec_context);

    // Drain any samples still buffered in the resampler.
    if (ctx->swr_context) {
        uint8_t** dummy = nullptr;
        // ...
        int alloc_ret = av_samples_alloc_array_and_samples(
            &dummy, nullptr,   // linesize output unused
            ctx->channels, 2048, AV_SAMPLE_FMT_DBL, 0);

        if (alloc_ret >= 0 && dummy) {
            while (swr_convert(ctx->swr_context, dummy, 2048, nullptr, 0) > 0) {
            }
            // ...
        }
    }
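A usage sketch for the public seek(): the coordinate vector's first element is taken to be the frame index, which matches the frame-based seek_internal() above (an assumption about coordinate order, not something the excerpt states). Assumes the project's SoundFileReader header:

// Sketch: seek to an absolute time in seconds (frame index assumed to be the first coordinate).
bool seek_to_seconds(SoundFileReader& reader, double seconds, uint32_t sample_rate)
{
    const auto frame = static_cast<uint64_t>(seconds * sample_rate);
    return reader.seek({ frame });
}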
    // create_container()
    auto container = std::make_shared<Kakshya::SoundFileContainer>();
    // ...

    // load_into_container()
    auto sound_container = std::dynamic_pointer_cast<Kakshya::SoundFileContainer>(container);
    if (!sound_container) {
        set_error("Container is not a SoundFileContainer");
        // ...
    }
    // ...
    auto total_frames = metadata->get_attribute<uint64_t>("total_frames").value_or(0);
    auto sample_rate = metadata->get_attribute<uint32_t>("sample_rate").value_or(48000);
    auto channels = metadata->get_attribute<uint32_t>("channels").value_or(2);

    sound_container->setup(total_frames, sample_rate, channels);
    // ...
    std::vector<Kakshya::DataVariant> audio_data = read_all();

    if (audio_data.empty()) {
        // ...
    }

    sound_container->set_raw_data(audio_data);
    // ...
    for (const auto& [name, group] : region_groups) {
        sound_container->add_region_group(group);
    }

    sound_container->create_default_processor();
    sound_container->mark_ready_for_processing(true);

    // get_preferred_chunk_size()
    if (m_context->codec_context->frame_size > 0) {
        return m_context->codec_context->frame_size * 4;
    }
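Putting create_container() and load_into_container() together gives a one-call loader. This sketch uses only the overrides listed in the reference section and assumes the project's headers:

#include <memory>
#include <string>

// Sketch: open a file and load it into a ready-to-process container.
std::shared_ptr<Kakshya::SignalSourceContainer> load_file(const std::string& path)
{
    SoundFileReader reader;
    if (!reader.open(path))
        return nullptr;

    auto container = reader.create_container();        // a SoundFileContainer underneath
    if (!container || !reader.load_into_container(container))
        return nullptr;

    reader.close();
    return container;
}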
902 "wav",
"flac",
"mp3",
"m4a",
"aac",
"ogg",
"opus",
"wma",
903 "aiff",
"aif",
"ape",
"wv",
"tta",
"mka",
"ac3",
"dts",
904 "mp2",
"mp4",
"webm",
"caf",
"amr",
"au",
"voc",
"w64",
905 "mpc",
"mp+",
"m4b",
"m4r",
"3gp",
"3g2",
"asf",
"rm",
906 "ra",
"avi",
"mov",
"mkv",
"ogv",
"ogx",
"oga",
"spx",
907 "f4a",
"f4b",
"f4v",
"m4v",
"asx",
"wvx",
"wax"
std::vector<std::vector<double>> SoundFileReader::deinterleave_data(
    const std::vector<double>& interleaved, uint32_t channels)
{
    if (channels <= 1) {
        return { interleaved };
    }

    std::vector<std::vector<double>> deinterleaved(channels);

    size_t samples_per_channel = interleaved.size() / channels;

    for (uint32_t ch = 0; ch < channels; ch++) {
        deinterleaved[ch].reserve(samples_per_channel);
        for (size_t i = 0; i < samples_per_channel; i++) {
            deinterleaved[ch].push_back(interleaved[i * channels + ch]);
        }
    }

    return deinterleaved;
}
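For completeness, the inverse operation; this helper is not part of the class, just a sketch that assumes all channel vectors have equal length:

#include <vector>

// Sketch: merge planar channel vectors back into interleaved LRLR... order.
std::vector<double> interleave_data(const std::vector<std::vector<double>>& planar)
{
    if (planar.empty())
        return {};

    const size_t channels = planar.size();
    const size_t frames = planar[0].size();   // assumes equal-length channels

    std::vector<double> interleaved;
    interleaved.reserve(frames * channels);
    for (size_t i = 0; i < frames; i++)
        for (size_t ch = 0; ch < channels; ch++)
            interleaved.push_back(planar[ch][i]);
    return interleaved;
}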
static std::unordered_map< std::string, Kakshya::RegionGroup > regions_to_groups(const std::vector< FileRegion > ®ions)
Convert file regions to region groups.
std::vector< Kakshya::DataVariant > read_all() override
Read the entire audio file into memory.
void close() override
Close the currently open file and release resources.
std::string get_last_error() const override
Get the last error message encountered by the reader.
uint64_t get_preferred_chunk_size() const override
Get the preferred chunk size for streaming reads.
uint32_t m_target_sample_rate
Target sample rate for resampling (0 = use source rate).
bool supports_streaming() const override
Check if the reader supports streaming access.
bool open(const std::string &filepath, FileReadOptions options=FileReadOptions::ALL) override
Open an audio file for reading.
bool load_into_container(std::shared_ptr< Kakshya::SignalSourceContainer > container) override
Load file data into an existing SignalSourceContainer.
std::shared_mutex m_context_mutex
bool can_read(const std::string &filepath) const override
Check if this reader can open the given file.
std::mutex m_metadata_mutex
Mutex for thread-safe metadata access.
static std::atomic< bool > s_ffmpeg_initialized
True if FFmpeg has been initialized.
std::vector< Kakshya::DataVariant > decode_frames(std::shared_ptr< FFmpegContext > ctx, uint64_t num_frames, uint64_t offset)
Decode a specific number of frames from the file.
bool seek_internal(std::shared_ptr< FFmpegContext > &ctx, uint64_t frame_position)
Internal seek implementation.
std::string m_last_error
Last error message encountered.
void extract_regions(const std::shared_ptr< FFmpegContext > &ctx)
Extract region information from the file.
std::vector< uint64_t > get_read_position() const override
Get the current read position in the file.
void set_error(const std::string &error) const
Set the last error message.
std::atomic< uint64_t > m_current_frame_position
Current frame position for reading.
void extract_metadata(const std::shared_ptr< FFmpegContext > &ctx)
Extract metadata from the file.
bool setup_resampler(const std::shared_ptr< FFmpegContext > &ctx)
Set up the FFmpeg resampler if needed.
std::vector< Kakshya::DataVariant > read_region(const FileRegion ®ion) override
Read a specific region from the file.
std::vector< Kakshya::DataVariant > read_frames(uint64_t num_frames, uint64_t offset=0)
Read a specific number of frames from the file.
AudioReadOptions m_audio_options
Audio-specific read options.
std::shared_ptr< FFmpegContext > m_context
std::optional< FileMetadata > m_cached_metadata
Cached file metadata.
std::vector< FileRegion > m_cached_regions
Cached file regions (markers, loops, etc.).
void clear_error() const
Clear the last error message.
~SoundFileReader() override
Destroy the SoundFileReader object.
std::string m_filepath
Path to the currently open file.
std::vector< uint64_t > get_dimension_sizes() const override
Get the size of each dimension (e.g., frames, channels).
std::shared_ptr< Kakshya::SignalSourceContainer > create_container() override
Create a SignalSourceContainer for this file.
bool seek(const std::vector< uint64_t > &position) override
Seek to a specific position in the file.
SoundFileReader()
Construct a new SoundFileReader object.
std::vector< std::vector< double > > deinterleave_data(const std::vector< double > &interleaved, uint32_t channels)
Convert interleaved audio data to deinterleaved (planar) format.
size_t get_num_dimensions() const override
Get the number of dimensions in the audio data (typically 2: time, channel).
std::optional< FileMetadata > get_metadata() const override
Get metadata for the currently open file.
std::vector< FileRegion > get_regions() const override
Get all regions (markers, loops, etc.) from the file.
bool is_open() const override
Check if a file is currently open.
FileReadOptions m_options
File read options used for this session.
std::vector< std::string > get_supported_extensions() const override
Get supported file extensions for this reader.
static void initialize_ffmpeg()
Initialize FFmpeg libraries (thread-safe, called automatically).
static std::mutex s_ffmpeg_init_mutex
Mutex for FFmpeg initialization.
FileReadOptions
Generic options for file reading behavior.
@ EXTRACT_METADATA
Extract file metadata.
@ EXTRACT_REGIONS
Extract semantic regions (format-specific)
@ NONE
No special options.
@ PLANAR
Separate DataVariant per logical unit (LLL...RRR for stereo)
@ INTERLEAVED
Single DataVariant with interleaved data (LRLRLR for stereo)
AVCodecContext * codec_context
AVFormatContext * format_context
std::vector< uint64_t > start_coordinates
N-dimensional start position (e.g., frame, x, y)
Kakshya::Region to_region() const
Convert this FileRegion to a Region for use in processing.
std::string name
Human-readable name for the region.
std::string type
Region type identifier (e.g., "cue", "scene", "block")
std::unordered_map< std::string, std::any > attributes
Region-specific metadata.
std::vector< uint64_t > end_coordinates
N-dimensional end position (inclusive)
Generic region descriptor for any file type.
static Region time_span(uint64_t start_frame, uint64_t end_frame, const std::string &label="", const std::any &extra_data={})
Create a Region representing a time span (e.g., a segment of frames).
void set_attribute(const std::string &key, std::any value)
Set an attribute value by key.
static Region time_point(uint64_t frame, const std::string &label="", const std::any &extra_data={})
Create a Region representing a single time point (e.g., a frame or sample).
Represents a point or span in N-dimensional space.
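A closing sketch of the Region factories documented above, combined with the RegionGroup fields used by regions_to_groups() (name, add_region()); the frame values and group name are illustrative, and the project's headers are assumed:

// Sketch: build a region group with the time_span()/time_point() factories.
Kakshya::RegionGroup make_demo_group()
{
    Kakshya::RegionGroup group;
    group.name = "markers";
    group.add_region(Kakshya::Region::time_span(0, 480000, "Intro"));   // a frame span
    group.add_region(Kakshya::Region::time_point(48000, "Drop"));       // a single-frame marker
    return group;
}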