From ce39e543087c47f64edc30ab2548f9dd9284b1b0 Mon Sep 17 00:00:00 2001 From: Gabriel Jablonski Date: Sun, 22 Feb 2026 16:21:50 -0300 Subject: [PATCH] feat: add audio transcoding support for WhatsApp Cloud API (#220) * feat: add audio transcoding support for WhatsApp Cloud API - Introduced `Audio::TranscodeService` to handle audio transcoding to OGG/Opus format. - Updated `Messages::MessageBuilder` to transcode audio attachments based on `transcode_audio` parameter. - Enhanced `WhatsappCloudService` to normalize audio content types and send voice flag for recorded audio in OGG format. - Added utility functions for audio conversion in JavaScript. - Updated Dockerfile to include FFmpeg for audio processing. - Added tests for audio transcoding and WhatsApp Cloud service interactions. * feat: enhance audio handling with transcoding support and error management * feat: improve audio transcoding error handling and enhance audio recording features * feat: enhance audio transcoding process and error handling for better reliability * feat: update recorded audio handling to support boolean and array formats --- Gemfile | 1 + Gemfile.lock | 3 + app/builders/messages/message_builder.rb | 27 +- app/javascript/dashboard/api/inbox/message.js | 10 +- .../widgets/WootWriter/AudioRecorder.vue | 73 ++- ...ersionUtils.js => audioConversionUtils.js} | 7 +- .../utils/specs/webmOpusToOgg.spec.js | 354 ++++++++++++++ .../widgets/WootWriter/utils/webmOpusToOgg.js | 454 ++++++++++++++++++ .../widgets/conversation/ReplyBox.vue | 18 + .../i18n/locale/en/conversation.json | 1 + .../i18n/locale/pt_BR/conversation.json | 1 + app/services/audio/transcode_service.rb | 78 +++ .../providers/whatsapp_cloud_service.rb | 27 +- docker/Dockerfile | 1 + lib/custom_exceptions/audio.rb | 13 + .../builders/messages/message_builder_spec.rb | 44 ++ spec/services/audio/transcode_service_spec.rb | 106 ++++ .../providers/whatsapp_cloud_service_spec.rb | 52 +- 18 files changed, 1223 insertions(+), 47 deletions(-) rename app/javascript/dashboard/components/widgets/WootWriter/utils/{mp3ConversionUtils.js => audioConversionUtils.js} (94%) create mode 100644 app/javascript/dashboard/components/widgets/WootWriter/utils/specs/webmOpusToOgg.spec.js create mode 100644 app/javascript/dashboard/components/widgets/WootWriter/utils/webmOpusToOgg.js create mode 100644 app/services/audio/transcode_service.rb create mode 100644 lib/custom_exceptions/audio.rb create mode 100644 spec/services/audio/transcode_service_spec.rb diff --git a/Gemfile b/Gemfile index 0edf25609..faec53f56 100644 --- a/Gemfile +++ b/Gemfile @@ -54,6 +54,7 @@ gem 'aws-sdk-s3', require: false gem 'azure-storage-blob', git: 'https://github.com/chatwoot/azure-storage-ruby', branch: 'chatwoot', require: false gem 'google-cloud-storage', '>= 1.48.0', require: false gem 'image_processing' +gem 'streamio-ffmpeg', '~> 3.0' ##-- for actionmailbox --## gem 'aws-actionmailbox-ses', '~> 0' diff --git a/Gemfile.lock b/Gemfile.lock index df1d54775..4f88969f1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -937,6 +937,8 @@ GEM squasher (0.7.2) stackprof (0.2.25) statsd-ruby (1.5.0) + streamio-ffmpeg (3.0.2) + multi_json (~> 1.8) stripe (18.0.1) telephone_number (1.4.20) test-prof (1.2.1) @@ -1151,6 +1153,7 @@ DEPENDENCIES spring-watcher-listen squasher stackprof + streamio-ffmpeg (~> 3.0) stripe (~> 18.0) telephone_number test-prof diff --git a/app/builders/messages/message_builder.rb b/app/builders/messages/message_builder.rb index a949eae36..91d4df118 100644 --- a/app/builders/messages/message_builder.rb +++ b/app/builders/messages/message_builder.rb @@ -14,6 +14,7 @@ class Messages::MessageBuilder # rubocop:disable Metrics/ClassLength @message_type = params[:message_type] || 'outgoing' @attachments = params[:attachments] @is_recorded_audio = params[:is_recorded_audio] + @transcode_audio = params[:transcode_audio] @attachments_metadata = normalize_attachments_metadata(params[:attachments_metadata]) @automation_rule = content_attributes&.dig(:automation_rule_id) return unless params.instance_of?(ActionController::Parameters) @@ -67,6 +68,7 @@ class Messages::MessageBuilder # rubocop:disable Metrics/ClassLength else file_type(uploaded_attachment&.content_type) end + transcode_attachment(attachment, file_like_source(uploaded_attachment)) if should_transcode?(attachment) end end @@ -78,9 +80,9 @@ class Messages::MessageBuilder # rubocop:disable Metrics/ClassLength end def recorded_audio_metadata(attachment) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity - # NOTE: `is_recorded_audio` can be either a boolean or an array of file names. + # NOTE: `is_recorded_audio` can be either a boolean, the string "true", or an array of file names. return unless @is_recorded_audio - return { is_recorded_audio: true } if @is_recorded_audio == true + return { is_recorded_audio: true } if @is_recorded_audio == true || @is_recorded_audio == 'true' return { is_recorded_audio: true } if @is_recorded_audio.is_a?(Array) && attachment.original_filename.in?(@is_recorded_audio) @@ -110,6 +112,27 @@ class Messages::MessageBuilder # rubocop:disable Metrics/ClassLength metadata.deep_stringify_keys end + def should_transcode?(attachment) + @transcode_audio.present? && attachment.file_type == 'audio' + end + + # Returns the uploaded file only when it's a real file-like object (ActionDispatch::Http::UploadedFile, + # Tempfile, etc.). Direct-upload signed-ID Strings are not usable as source files for transcoding; + # TranscodeService falls back to downloading from the blob in that case. + def file_like_source(uploaded_attachment) + return uploaded_attachment if uploaded_attachment.respond_to?(:path) || uploaded_attachment.respond_to?(:tempfile) + end + + def transcode_attachment(attachment, uploaded_file = nil) + Audio::TranscodeService.new(attachment, @transcode_audio, source_file: uploaded_file).perform + attachment.meta ||= {} + attachment.meta['is_recorded_audio'] = true + rescue CustomExceptions::Audio::UnsupportedFormatError, CustomExceptions::Audio::TranscodingError => e + Rails.logger.error("Audio transcoding failed, keeping original attachment: #{e.message}") + attachment.meta ||= {} + attachment.meta['audio_transcoding_failed'] = true + end + def process_emails return unless @conversation.inbox&.inbox_type == 'Email' diff --git a/app/javascript/dashboard/api/inbox/message.js b/app/javascript/dashboard/api/inbox/message.js index 8f6a98fad..99adc0302 100644 --- a/app/javascript/dashboard/api/inbox/message.js +++ b/app/javascript/dashboard/api/inbox/message.js @@ -23,9 +23,13 @@ export const buildCreatePayload = ({ files.forEach(file => { payload.append('attachments[]', file); }); - isRecordedAudio?.forEach(filename => { - payload.append('is_recorded_audio[]', filename); - }); + if (isRecordedAudio === true) { + payload.append('is_recorded_audio', true); + } else if (Array.isArray(isRecordedAudio)) { + isRecordedAudio.forEach(filename => { + payload.append('is_recorded_audio[]', filename); + }); + } payload.append('private', isPrivate); payload.append('echo_id', echoId); payload.append('cc_emails', ccEmails); diff --git a/app/javascript/dashboard/components/widgets/WootWriter/AudioRecorder.vue b/app/javascript/dashboard/components/widgets/WootWriter/AudioRecorder.vue index 924e72c53..3c83d5cd9 100644 --- a/app/javascript/dashboard/components/widgets/WootWriter/AudioRecorder.vue +++ b/app/javascript/dashboard/components/widgets/WootWriter/AudioRecorder.vue @@ -4,7 +4,7 @@ import { ref, onMounted, onUnmounted, defineEmits, defineExpose } from 'vue'; import WaveSurfer from 'wavesurfer.js'; import RecordPlugin from 'wavesurfer.js/dist/plugins/record.js'; import { format, intervalToDuration } from 'date-fns'; -import { convertAudio } from './utils/mp3ConversionUtils'; +import { convertAudio } from './utils/audioConversionUtils'; const props = defineProps({ audioRecordFormat: { @@ -18,6 +18,7 @@ const emit = defineEmits([ 'finishRecord', 'pause', 'play', + 'recordError', ]); const waveformContainer = ref(null); @@ -26,6 +27,7 @@ const record = ref(null); const isRecording = ref(false); const isPlaying = ref(false); const hasRecording = ref(false); +const recordedAudioUrl = ref(null); const formatTimeProgress = time => { const duration = intervalToDuration({ start: 0, end: time }); @@ -35,6 +37,28 @@ const formatTimeProgress = time => { ); }; +const AUDIO_EXTENSION_MAP = { + 'audio/ogg': 'ogg', + 'audio/mp3': 'mp3', + 'audio/mpeg': 'mp3', + 'audio/wav': 'wav', + 'audio/webm': 'webm', +}; + +const getRecordPluginOptions = audioFormat => { + const options = { + scrollingWaveform: true, + renderRecordedAudio: false, + }; + if ( + audioFormat === 'audio/ogg' && + MediaRecorder.isTypeSupported('audio/ogg;codecs=opus') + ) { + options.mimeType = 'audio/ogg;codecs=opus'; + } + return options; +}; + const initWaveSurfer = () => { wavesurfer.value = WaveSurfer.create({ container: waveformContainer.value, @@ -45,10 +69,7 @@ const initWaveSurfer = () => { barGap: 1, barRadius: 2, plugins: [ - RecordPlugin.create({ - scrollingWaveform: true, - renderRecordedAudio: false, - }), + RecordPlugin.create(getRecordPluginOptions(props.audioRecordFormat)), ], }); @@ -62,21 +83,29 @@ const initWaveSurfer = () => { }); record.value.on('record-end', async blob => { - const audioUrl = URL.createObjectURL(blob); - const audioBlob = await convertAudio(blob, props.audioRecordFormat); - const fileName = `${getUuid()}.mp3`; - const file = new File([audioBlob], fileName, { - type: props.audioRecordFormat, - }); - wavesurfer.value.load(audioUrl); - emit('finishRecord', { - name: file.name, - type: file.type, - size: file.size, - file, - }); - hasRecording.value = true; - isRecording.value = false; + try { + const audioBlob = await convertAudio(blob, props.audioRecordFormat); + const ext = AUDIO_EXTENSION_MAP[props.audioRecordFormat] || 'mp3'; + const fileName = `${getUuid()}.${ext}`; + const file = new File([audioBlob], fileName, { + type: props.audioRecordFormat, + }); + if (recordedAudioUrl.value) URL.revokeObjectURL(recordedAudioUrl.value); + recordedAudioUrl.value = URL.createObjectURL(audioBlob); + wavesurfer.value.load(recordedAudioUrl.value); + emit('finishRecord', { + name: file.name, + type: file.type, + size: file.size, + file, + }); + hasRecording.value = true; + isRecording.value = false; + } catch (error) { + isRecording.value = false; + hasRecording.value = false; + emit('recordError', { error }); + } }); record.value.on('record-progress', time => { @@ -109,6 +138,10 @@ onMounted(() => { }); onUnmounted(() => { + if (recordedAudioUrl.value) { + URL.revokeObjectURL(recordedAudioUrl.value); + recordedAudioUrl.value = null; + } if (wavesurfer.value) { wavesurfer.value.destroy(); } diff --git a/app/javascript/dashboard/components/widgets/WootWriter/utils/mp3ConversionUtils.js b/app/javascript/dashboard/components/widgets/WootWriter/utils/audioConversionUtils.js similarity index 94% rename from app/javascript/dashboard/components/widgets/WootWriter/utils/mp3ConversionUtils.js rename to app/javascript/dashboard/components/widgets/WootWriter/utils/audioConversionUtils.js index 3ae37911d..a0bfcc1a5 100644 --- a/app/javascript/dashboard/components/widgets/WootWriter/utils/mp3ConversionUtils.js +++ b/app/javascript/dashboard/components/widgets/WootWriter/utils/audioConversionUtils.js @@ -1,5 +1,7 @@ import lamejs from '@breezystack/lamejs'; +import { remuxWebmToOgg } from './webmOpusToOgg'; + const writeString = (view, offset, string) => { // eslint-disable-next-line no-plusplus for (let i = 0; i < string.length; i++) { @@ -135,7 +137,10 @@ export const convertToMp3 = async (audioBlob, bitrate = 128) => { export const convertAudio = async (inputBlob, outputFormat, bitrate = 128) => { let audio; - if (outputFormat === 'audio/wav') { + if (outputFormat === 'audio/ogg') { + // Chrome produces WebM even when OGG is requested; remux to proper OGG/Opus + audio = await remuxWebmToOgg(inputBlob); + } else if (outputFormat === 'audio/wav') { audio = await convertToWav(inputBlob); } else if (outputFormat === 'audio/mp3') { audio = await convertToMp3(inputBlob, bitrate); diff --git a/app/javascript/dashboard/components/widgets/WootWriter/utils/specs/webmOpusToOgg.spec.js b/app/javascript/dashboard/components/widgets/WootWriter/utils/specs/webmOpusToOgg.spec.js new file mode 100644 index 000000000..e514630e3 --- /dev/null +++ b/app/javascript/dashboard/components/widgets/WootWriter/utils/specs/webmOpusToOgg.spec.js @@ -0,0 +1,354 @@ +/* eslint-disable no-bitwise */ +import { describe, it, expect, vi } from 'vitest'; +import { remuxWebmToOgg } from '../webmOpusToOgg'; + +/** + * Helper: build a Blob from a Uint8Array. + * jsdom's Blob may lack .arrayBuffer(), so we polyfill it. + */ +function blobFrom(bytes) { + const blob = new Blob([bytes], { type: 'audio/webm' }); + if (!blob.arrayBuffer) { + blob.arrayBuffer = () => + new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => resolve(reader.result); + reader.onerror = () => reject(reader.error); + reader.readAsArrayBuffer(blob); + }); + } + return blob; +} + +/** + * Safely read a Blob's ArrayBuffer (works even if Blob.arrayBuffer is missing in jsdom). + */ +async function readBlobAsArrayBuffer(blob) { + if (blob.arrayBuffer) { + return blob.arrayBuffer(); + } + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => resolve(reader.result); + reader.onerror = () => reject(reader.error); + reader.readAsArrayBuffer(blob); + }); +} + +// --- EBML element helpers (shared across tests) --- + +function writeVint(value) { + // 1-byte VINT for values 0-126 + if (value < 0x7f) return [0x80 | value]; + // 2-byte VINT for values up to 0x3fff + return [0x40 | ((value >> 8) & 0x3f), value & 0xff]; +} + +function writeId(id) { + if (id <= 0xff) return [id]; + if (id <= 0xffff) return [(id >> 8) & 0xff, id & 0xff]; + if (id <= 0xffffff) return [(id >> 16) & 0xff, (id >> 8) & 0xff, id & 0xff]; + return [(id >> 24) & 0xff, (id >> 16) & 0xff, (id >> 8) & 0xff, id & 0xff]; +} + +function element(id, payload) { + return [...writeId(id), ...writeVint(payload.length), ...payload]; +} + +function masterUnknown(id, children) { + // Unknown size: 0x01 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF (8-byte VINT all-ones) + const childBytes = children.flat(); + return [ + ...writeId(id), + 0x01, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + ...childBytes, + ]; +} + +function master(id, children) { + const childBytes = children.flat(); + return element(id, childBytes); +} + +/** + * Build a minimal valid WebM-like buffer that contains at least one + * SimpleBlock with a synthetic Opus packet. + * + * Layout (simplified EBML): + * EBML Header (master) + * Segment (master, unknown size) + * Tracks (master) + * TrackEntry (master) + * Audio (master) + * Channels (uint, 1) + * SamplingFrequency (float64, 48000.0) + * Cluster (master, unknown size) + * SimpleBlock (track=1, timecode=0, flags=0, opus packet) + */ +function buildMinimalWebM() { + const parts = []; + + // Channels = 1 (element 0x9F, uint8) + const channels = element(0x9f, [1]); + + // SamplingFrequency = 48000.0 (element 0xB5, float64) + const freqBuf = new ArrayBuffer(8); + new DataView(freqBuf).setFloat64(0, 48000.0); + const freqBytes = [...new Uint8Array(freqBuf)]; + const samplingFreq = element(0xb5, freqBytes); + + // Audio master (0xE1) + const audio = master(0xe1, [channels, samplingFreq]); + + // TrackEntry (0xAE) + const trackEntry = master(0xae, [audio]); + + // Tracks (0x1654AE6B) + const tracks = master(0x1654ae6b, [trackEntry]); + + // Build a SimpleBlock (0xA3) + // Track number = 1 (VINT: 0x81), timecode = 0 (int16 BE: 0x00 0x00), flags = 0x00 + // Followed by a synthetic Opus packet (TOC byte = 0xFC → config=31 CELT FB 20ms, code=0 → 1 frame) + const opusPacket = [0xfc, 0x00, 0x01, 0x02, 0x03]; // 5-byte synthetic Opus packet + const simpleBlockPayload = [0x81, 0x00, 0x00, 0x00, ...opusPacket]; // track=1, timecode=0, flags=0 + const simpleBlock = element(0xa3, simpleBlockPayload); + + // Cluster (0x1F43B675) with unknown size + const cluster = masterUnknown(0x1f43b675, [simpleBlock]); + + // Segment (0x18538067) with unknown size + const segment = masterUnknown(0x18538067, [tracks, cluster]); + + // EBML Header (0x1A45DFA3) — minimal + const ebmlHeader = master(0x1a45dfa3, []); + + parts.push(...ebmlHeader, ...segment); + + return new Uint8Array(parts); +} + +describe('remuxWebmToOgg', () => { + it('returns the original Blob when input starts with OggS', async () => { + const oggBytes = new Uint8Array([0x4f, 0x67, 0x67, 0x53, 0x00, 0x01, 0x02]); + const oggBlob = blobFrom(oggBytes); + + const result = await remuxWebmToOgg(oggBlob); + + // Should be the exact same Blob reference (passthrough) + expect(result).toBe(oggBlob); + }); + + it('throws an error when parseWebM yields no frames', async () => { + // An empty Blob (no EBML data, no OggS magic) → parseWebM finds no frames + const emptyBlob = blobFrom(new Uint8Array([0x00, 0x00, 0x00, 0x00])); + + await expect(remuxWebmToOgg(emptyBlob)).rejects.toThrow( + 'No Opus frames found in WebM input' + ); + }); + + it('remuxes a minimal WebM input into valid OGG output', async () => { + const webmBytes = buildMinimalWebM(); + const webmBlob = blobFrom(webmBytes); + + const result = await remuxWebmToOgg(webmBlob); + + expect(result).toBeInstanceOf(Blob); + expect(result.type).toBe('audio/ogg'); + + const outBuf = await readBlobAsArrayBuffer(result); + const outBytes = new Uint8Array(outBuf); + + // Must start with OggS capture pattern + expect(outBytes[0]).toBe(0x4f); // O + expect(outBytes[1]).toBe(0x67); // g + expect(outBytes[2]).toBe(0x67); // g + expect(outBytes[3]).toBe(0x53); // S + + // Count OGG pages (each starts with "OggS") + let pageCount = 0; + for (let i = 0; i <= outBytes.length - 4; i += 1) { + if ( + outBytes[i] === 0x4f && + outBytes[i + 1] === 0x67 && + outBytes[i + 2] === 0x67 && + outBytes[i + 3] === 0x53 + ) { + pageCount += 1; + } + } + + // At least 3 pages: OpusHead (BOS) + OpusTags + audio page(s) + expect(pageCount).toBeGreaterThanOrEqual(3); + }); + + it('has sequential page numbers in OGG output', async () => { + const webmBytes = buildMinimalWebM(); + const webmBlob = blobFrom(webmBytes); + + const result = await remuxWebmToOgg(webmBlob); + const outBuf = await readBlobAsArrayBuffer(result); + const outBytes = new Uint8Array(outBuf); + const dv = new DataView(outBuf); + + // Collect page sequence numbers from OGG pages (offset 18 in each page header) + const pageSeqs = []; + for (let i = 0; i <= outBytes.length - 27; i += 1) { + if ( + outBytes[i] === 0x4f && + outBytes[i + 1] === 0x67 && + outBytes[i + 2] === 0x67 && + outBytes[i + 3] === 0x53 + ) { + pageSeqs.push(dv.getUint32(i + 18, true)); + } + } + + // Pages should be 0, 1, 2, ... + pageSeqs.forEach((seq, idx) => { + expect(seq).toBe(idx); + }); + }); + + it('has the same serial number across all pages', async () => { + const webmBytes = buildMinimalWebM(); + const webmBlob = blobFrom(webmBytes); + + const result = await remuxWebmToOgg(webmBlob); + const outBuf = await readBlobAsArrayBuffer(result); + const outBytes = new Uint8Array(outBuf); + const dv = new DataView(outBuf); + + const serials = []; + for (let i = 0; i <= outBytes.length - 27; i += 1) { + if ( + outBytes[i] === 0x4f && + outBytes[i + 1] === 0x67 && + outBytes[i + 2] === 0x67 && + outBytes[i + 3] === 0x53 + ) { + serials.push(dv.getUint32(i + 14, true)); + } + } + + // All pages share the same serial + const unique = [...new Set(serials)]; + expect(unique).toHaveLength(1); + }); + + it('first page contains OpusHead', async () => { + const webmBytes = buildMinimalWebM(); + const webmBlob = blobFrom(webmBytes); + + const result = await remuxWebmToOgg(webmBlob); + const outBuf = await readBlobAsArrayBuffer(result); + const outBytes = new Uint8Array(outBuf); + + // First page is BOS (header_type byte at offset 5 has bit 0x02 set) + expect(outBytes[5] & 0x02).toBe(0x02); + + // Find the segment data in first page and check for OpusHead magic + const numSegments = outBytes[26]; + const dataStart = 27 + numSegments; + const magic = new TextDecoder().decode( + outBytes.slice(dataStart, dataStart + 8) + ); + expect(magic).toBe('OpusHead'); + }); + + it('second page contains OpusTags', async () => { + const webmBytes = buildMinimalWebM(); + const webmBlob = blobFrom(webmBytes); + + const result = await remuxWebmToOgg(webmBlob); + const outBuf = await readBlobAsArrayBuffer(result); + const outBytes = new Uint8Array(outBuf); + + // Find second OggS page + const pageStarts = []; + for (let i = 0; i <= outBytes.length - 4; i += 1) { + if ( + outBytes[i] === 0x4f && + outBytes[i + 1] === 0x67 && + outBytes[i + 2] === 0x67 && + outBytes[i + 3] === 0x53 + ) { + pageStarts.push(i); + } + } + + expect(pageStarts.length).toBeGreaterThanOrEqual(2); + const page2Start = pageStarts[1]; + const numSegments = outBytes[page2Start + 26]; + const dataStart = page2Start + 27 + numSegments; + const magic = new TextDecoder().decode( + outBytes.slice(dataStart, dataStart + 8) + ); + expect(magic).toBe('OpusTags'); + }); + + it('last page has EOS flag set', async () => { + const webmBytes = buildMinimalWebM(); + const webmBlob = blobFrom(webmBytes); + + const result = await remuxWebmToOgg(webmBlob); + const outBuf = await readBlobAsArrayBuffer(result); + const outBytes = new Uint8Array(outBuf); + + // Find the last OggS page + let lastPageStart = -1; + for (let i = 0; i <= outBytes.length - 4; i += 1) { + if ( + outBytes[i] === 0x4f && + outBytes[i + 1] === 0x67 && + outBytes[i + 2] === 0x67 && + outBytes[i + 3] === 0x53 + ) { + lastPageStart = i; + } + } + + expect(lastPageStart).toBeGreaterThan(0); + // EOS flag = 0x04 + expect(outBytes[lastPageStart + 5] & 0x04).toBe(0x04); + }); + + it('logs warning for laced SimpleBlock', async () => { + const consoleSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + // Build a WebM with a laced SimpleBlock (flags byte with lacing bits set) + const parts = []; + + const tracks = master(0x1654ae6b, [ + master(0xae, [master(0xe1, [element(0x9f, [1])])]), + ]); + + // SimpleBlock with lacing bits set (flags = 0x06 → Xiph lacing) + const opusPacket = [0xfc, 0x00, 0x01]; + const simpleBlockPayload = [0x81, 0x00, 0x00, 0x06, ...opusPacket]; + const simpleBlock = element(0xa3, simpleBlockPayload); + const cluster = masterUnknown(0x1f43b675, [simpleBlock]); + const segment = masterUnknown(0x18538067, [tracks, cluster]); + const ebmlHeader = master(0x1a45dfa3, []); + + parts.push(...ebmlHeader, ...segment); + + const webmBlob = blobFrom(new Uint8Array(parts)); + // Should still produce output (not crash), but warn + const result = await remuxWebmToOgg(webmBlob); + expect(result).toBeInstanceOf(Blob); + + expect(consoleSpy).toHaveBeenCalledWith( + expect.stringContaining('laced SimpleBlock detected') + ); + + consoleSpy.mockRestore(); + }); +}); diff --git a/app/javascript/dashboard/components/widgets/WootWriter/utils/webmOpusToOgg.js b/app/javascript/dashboard/components/widgets/WootWriter/utils/webmOpusToOgg.js new file mode 100644 index 000000000..4195b30e2 --- /dev/null +++ b/app/javascript/dashboard/components/widgets/WootWriter/utils/webmOpusToOgg.js @@ -0,0 +1,454 @@ +/* eslint-disable no-bitwise */ +/** + * WebM/Opus → OGG/Opus remuxer + * + * Chrome's MediaRecorder produces WebM containers even when + * `audio/ogg;codecs=opus` is requested. WhatsApp Cloud API requires + * proper OGG/Opus files for voice messages. + * + * This module extracts raw Opus packets from the WebM (EBML) container + * and repackages them into a valid OGG bitstream. The audio data itself + * is never re-encoded — only the container format changes. + * + * References: + * EBML (container for WebM): RFC 8794 — https://www.rfc-editor.org/rfc/rfc8794 + * Matroska/WebM elements: https://www.matroska.org/technical/elements.html + * OGG bitstream framing: RFC 3533 — https://www.rfc-editor.org/rfc/rfc3533 + * Opus codec: RFC 6716 — https://www.rfc-editor.org/rfc/rfc6716 + * Opus in OGG (OpusHead/Tags): RFC 7845 — https://www.rfc-editor.org/rfc/rfc7845 + */ + +// ======================== EBML / WebM parser ======================== + +const EBML_IDS = { + Segment: 0x18538067, + SegmentInfo: 0x1549a966, + Tracks: 0x1654ae6b, + TrackEntry: 0xae, + CodecPrivate: 0x63a2, + Audio: 0xe1, + SamplingFrequency: 0xb5, + Channels: 0x9f, + Cluster: 0x1f43b675, + Timecode: 0xe7, + SimpleBlock: 0xa3, + BlockGroup: 0xa0, + Block: 0xa1, +}; + +const MASTER_ELEMENTS = new Set([ + 0x1a45dfa3, // EBML header + EBML_IDS.Segment, + EBML_IDS.SegmentInfo, + EBML_IDS.Tracks, + EBML_IDS.TrackEntry, + EBML_IDS.Audio, + EBML_IDS.Cluster, + EBML_IDS.BlockGroup, +]); + +/** Read an EBML variable-length integer (data size). */ +function readVint(data, pos) { + if (pos >= data.length) return null; + const first = data[pos]; + if (first === 0) return null; + + let len = 1; + let mask = 0x80; + while (len <= 8 && !(first & mask)) { + len += 1; + mask >>= 1; + } + if (len > 8 || pos + len > data.length) return null; + + let value = first & (mask - 1); + for (let i = 1; i < len; i += 1) { + value = value * 256 + data[pos + i]; + } + return { value, length: len }; +} + +/** Read an EBML element ID (leading marker bits are kept). */ +function readElementId(data, pos) { + if (pos >= data.length) return null; + const first = data[pos]; + if (first === 0) return null; + + let len = 1; + let mask = 0x80; + while (len <= 4 && !(first & mask)) { + len += 1; + mask >>= 1; + } + if (len > 4 || pos + len > data.length) return null; + + let id = first; + for (let i = 1; i < len; i += 1) { + id = id * 256 + data[pos + i]; + } + return { id, length: len }; +} + +function readUintBE(data, offset, length) { + let v = 0; + for (let i = 0; i < length; i += 1) v = v * 256 + data[offset + i]; + return v; +} + +function readFloatBE(data, offset, length) { + if (length !== 4 && length !== 8) return NaN; + const buf = new ArrayBuffer(length); + const u8 = new Uint8Array(buf); + for (let i = 0; i < length; i += 1) u8[i] = data[offset + i]; + const view = new DataView(buf); + return length === 4 ? view.getFloat32(0) : view.getFloat64(0); +} + +/** Extract the raw Opus frame from a SimpleBlock / Block element. */ +function extractFrameFromBlock(data, offset, end) { + const trackVint = readVint(data, offset); + if (!trackVint) return null; + let pos = offset + trackVint.length; + + // int16 relative timecode (big-endian, signed) – skip + pos += 2; + // Flags byte – skip. Lacing (Xiph/EBML/fixed-size) is NOT supported; + // this assumes single-frame blocks as produced by MediaRecorder. + const flags = data[pos]; + const lacingBits = (flags >> 1) & 0x03; + if (lacingBits !== 0) { + // eslint-disable-next-line no-console + console.warn( + 'webmOpusToOgg: laced SimpleBlock detected (unsupported), frame may be invalid' + ); + } + pos += 1; + + if (pos >= end) return null; + return data.slice(pos, end); +} + +/** + * Walk the EBML tree and collect metadata + Opus frames. + * We only descend into master elements and only extract the fields we need. + */ +function parseWebM(buffer) { + const data = new Uint8Array(buffer); + const result = { + channels: 1, + sampleRate: 48000, + codecPrivate: null, + frames: [], + }; + + function walk(start, end) { + let pos = start; + while (pos < end) { + const idRes = readElementId(data, pos); + if (!idRes) break; + pos += idRes.length; + + const sizeRes = readVint(data, pos); + if (!sizeRes) break; + pos += sizeRes.length; + + // Handle "unknown size" (all-ones VINT) by treating it as the rest of the parent + // Use Math.pow instead of bit-shift to avoid 32-bit overflow for 5+ byte VINTs + const maxVint = 2 ** (7 * sizeRes.length) - 1; + const elEnd = + sizeRes.value === maxVint ? end : Math.min(pos + sizeRes.value, end); + + if (MASTER_ELEMENTS.has(idRes.id)) { + walk(pos, elEnd); + } else { + switch (idRes.id) { + case EBML_IDS.Channels: + result.channels = readUintBE(data, pos, sizeRes.value); + break; + case EBML_IDS.SamplingFrequency: + result.sampleRate = readFloatBE(data, pos, sizeRes.value); + break; + case EBML_IDS.CodecPrivate: + result.codecPrivate = data.slice(pos, elEnd); + break; + case EBML_IDS.SimpleBlock: + case EBML_IDS.Block: { + const frame = extractFrameFromBlock(data, pos, elEnd); + if (frame && frame.length > 0) result.frames.push(frame); + break; + } + default: + break; + } + } + pos = elEnd; + } + } + + walk(0, data.length); + return result; +} + +// ======================== OGG writer ======================== + +/** OGG CRC-32 table (polynomial 0x04C11DB7). */ +const CRC_TABLE = (() => { + const t = new Uint32Array(256); + for (let i = 0; i < 256; i += 1) { + let c = i << 24; + for (let j = 0; j < 8; j += 1) { + c = ((c << 1) ^ (c & 0x80000000 ? 0x04c11db7 : 0)) >>> 0; + } + t[i] = c; + } + return t; +})(); + +function oggCrc32(bytes) { + let crc = 0; + for (let i = 0; i < bytes.length; i += 1) { + crc = (CRC_TABLE[((crc >>> 24) ^ bytes[i]) & 0xff] ^ (crc << 8)) >>> 0; + } + return crc; +} + +/** + * Build one OGG page. + * + * @param {number} headerType 0x02 = BOS, 0x04 = EOS, 0x00 = normal + * @param {number} granulePosition 48 kHz sample count + * @param {number} serialNumber logical stream id + * @param {number} pageSeq page sequence counter + * @param {Uint8Array[]} packets one or more complete Opus packets + */ +function createOggPage( + headerType, + granulePosition, + serialNumber, + pageSeq, + packets +) { + // Build the lacing / segment table + const segTable = []; + let dataLen = 0; + packets.forEach(pkt => { + let rem = pkt.length; + while (rem >= 255) { + segTable.push(255); + rem -= 255; + } + segTable.push(rem); // final segment (0 when pkt.length is a multiple of 255) + dataLen += pkt.length; + }); + + const hdrLen = 27 + segTable.length; + const page = new Uint8Array(hdrLen + dataLen); + const dv = new DataView(page.buffer); + + // Capture pattern + page.set([0x4f, 0x67, 0x67, 0x53]); // "OggS" + page[4] = 0; // version + page[5] = headerType; + + // Granule position (int64 LE) + dv.setUint32(6, granulePosition & 0xffffffff, true); + dv.setUint32( + 10, + Math.floor(granulePosition / 0x100000000) & 0xffffffff, + true + ); + + dv.setUint32(14, serialNumber, true); // serial + dv.setUint32(18, pageSeq, true); // page sequence + dv.setUint32(22, 0, true); // CRC placeholder + + page[26] = segTable.length; + for (let i = 0; i < segTable.length; i += 1) page[27 + i] = segTable[i]; + + let off = hdrLen; + packets.forEach(pkt => { + page.set(pkt, off); + off += pkt.length; + }); + + // Fill in the CRC + dv.setUint32(22, oggCrc32(page), true); + return page; +} + +// ======================== Opus helpers ======================== + +/** Lookup table: frame duration in ms for each Opus TOC config index (0-31). */ +const OPUS_FRAME_MS = [ + 10, + 20, + 40, + 60, // 0-3 SILK NB + 10, + 20, + 40, + 60, // 4-7 SILK MB + 10, + 20, + 40, + 60, // 8-11 SILK WB + 10, + 20, // 12-13 Hybrid SWB + 10, + 20, // 14-15 Hybrid FB + 2.5, + 5, + 10, + 20, // 16-19 CELT NB + 2.5, + 5, + 10, + 20, // 20-23 CELT WB + 2.5, + 5, + 10, + 20, // 24-27 CELT SWB + 2.5, + 5, + 10, + 20, // 28-31 CELT FB +]; + +/** Return the total number of 48 kHz PCM samples represented by an Opus packet. */ +function opusPacketSamples(pkt) { + if (!pkt || pkt.length === 0) return 960; // default 20 ms + const toc = pkt[0]; + const config = (toc >> 3) & 0x1f; + const code = toc & 0x03; + + const samplesPerFrame = ((OPUS_FRAME_MS[config] || 20) * 48000) / 1000; + let frameCount; + if (code <= 1) frameCount = code + 1; + else if (code === 2) frameCount = 2; + else frameCount = pkt.length >= 2 ? pkt[1] & 0x3f : 1; + + return samplesPerFrame * frameCount; +} + +function buildOpusHead(channels, sampleRate, preSkip) { + const buf = new Uint8Array(19); + const dv = new DataView(buf.buffer); + buf.set(new TextEncoder().encode('OpusHead')); + buf[8] = 1; // version + buf[9] = channels; + dv.setUint16(10, preSkip, true); + dv.setUint32(12, sampleRate, true); + dv.setInt16(16, 0, true); // output gain + buf[18] = 0; // channel mapping family + return buf; +} + +function buildOpusTags() { + const vendor = new TextEncoder().encode('chatwoot'); + const buf = new Uint8Array(8 + 4 + vendor.length + 4); + const dv = new DataView(buf.buffer); + buf.set(new TextEncoder().encode('OpusTags')); + dv.setUint32(8, vendor.length, true); + buf.set(vendor, 12); + dv.setUint32(12 + vendor.length, 0, true); // 0 user comments + return buf; +} + +// ======================== Public API ======================== + +const MAX_FRAMES_PER_PAGE = 50; // ~1 s at 20 ms/frame +const MAX_SEGMENTS_PER_PAGE = 255; + +/** + * Remux a WebM/Opus blob into an OGG/Opus blob. + * If the input is already OGG (starts with "OggS"), it is returned as-is. + * + * @param {Blob} webmBlob + * @returns {Promise} OGG/Opus blob + */ +export async function remuxWebmToOgg(webmBlob) { + const buffer = await webmBlob.arrayBuffer(); + const bytes = new Uint8Array(buffer); + + // Already OGG? Return unchanged. + if ( + bytes.length >= 4 && + bytes[0] === 0x4f && + bytes[1] === 0x67 && + bytes[2] === 0x67 && + bytes[3] === 0x53 + ) { + return webmBlob; + } + + const { channels, sampleRate, codecPrivate, frames } = parseWebM(buffer); + if (frames.length === 0) { + throw new Error('No Opus frames found in WebM input'); + } + + // Extract pre-skip from the WebM CodecPrivate (which IS the OpusHead) + let preSkip = 312; + if (codecPrivate && codecPrivate.length >= 12) { + const magic = new TextDecoder().decode(codecPrivate.slice(0, 8)); + if (magic === 'OpusHead') { + preSkip = new DataView( + codecPrivate.buffer, + codecPrivate.byteOffset, + codecPrivate.length + ).getUint16(10, true); + } + } + + const serial = (Math.random() * 0x100000000) >>> 0; + let pageSeq = 0; + const pages = []; + + // Page 0 – OpusHead (BOS) + pages.push( + createOggPage(0x02, 0, serial, pageSeq, [ + buildOpusHead(channels, sampleRate, preSkip), + ]) + ); + pageSeq += 1; + + // Page 1 – OpusTags + pages.push(createOggPage(0x00, 0, serial, pageSeq, [buildOpusTags()])); + pageSeq += 1; + + // Audio pages + let granule = 0; + let idx = 0; + + while (idx < frames.length) { + const packets = []; + let segs = 0; + + while (idx < frames.length && packets.length < MAX_FRAMES_PER_PAGE) { + const pkt = frames[idx]; + const pktSegs = Math.ceil(pkt.length / 255) || 1; + if (segs + pktSegs > MAX_SEGMENTS_PER_PAGE && packets.length > 0) break; + + packets.push(pkt); + segs += pktSegs; + granule += opusPacketSamples(pkt); + idx += 1; + } + + const isLast = idx >= frames.length; + pages.push( + createOggPage(isLast ? 0x04 : 0x00, granule, serial, pageSeq, packets) + ); + pageSeq += 1; + } + + // Concatenate pages into a single buffer + const total = pages.reduce((s, p) => s + p.length, 0); + const out = new Uint8Array(total); + let off = 0; + pages.forEach(p => { + out.set(p, off); + off += p.length; + }); + + return new Blob([out], { type: 'audio/ogg' }); +} diff --git a/app/javascript/dashboard/components/widgets/conversation/ReplyBox.vue b/app/javascript/dashboard/components/widgets/conversation/ReplyBox.vue index db384b6c7..442e2648b 100644 --- a/app/javascript/dashboard/components/widgets/conversation/ReplyBox.vue +++ b/app/javascript/dashboard/components/widgets/conversation/ReplyBox.vue @@ -360,6 +360,9 @@ export default { return `draft-${this.conversationIdByRoute}-${this.replyType}`; }, audioRecordFormat() { + if (this.isAWhatsAppCloudChannel) { + return AUDIO_FORMATS.OGG; + } if (this.isAWhatsAppChannel || this.isATelegramChannel) { return AUDIO_FORMATS.MP3; } @@ -1027,6 +1030,10 @@ export default { }; return file && this.onFileUpload(autoRecordedFile); }, + onRecordError() { + this.toggleAudioRecorder(); + useAlert(this.$t('CONVERSATION.REPLYBOX.AUDIO_CONVERSION_FAILED')); + }, toggleTyping(status) { const conversationId = this.currentChat.id; const isPrivate = this.isPrivate; @@ -1094,6 +1101,13 @@ export default { sender: this.sender, }; + if (attachment.isRecordedAudio) { + attachmentPayload.isRecordedAudio = this.globalConfig + .directUploadsEnabled + ? true + : [attachment.resource.file.name]; + } + attachmentPayload = this.setReplyToInPayload(attachmentPayload); multipleMessagePayload.push(attachmentPayload); // For WhatsApp, only the first attachment gets a caption @@ -1142,6 +1156,9 @@ export default { this.attachedFiles.forEach(attachment => { if (this.globalConfig.directUploadsEnabled) { messagePayload.files.push(attachment.blobSignedId); + if (attachment.isRecordedAudio) { + messagePayload.isRecordedAudio = true; + } } else { messagePayload.files.push(attachment.resource.file); if (attachment.isRecordedAudio) { @@ -1304,6 +1321,7 @@ export default { :audio-record-format="audioRecordFormat" @recorder-progress-changed="onRecordProgressChanged" @finish-record="onFinishRecorder" + @record-error="onRecordError" @play="recordingAudioState = 'playing'" @pause="recordingAudioState = 'paused'" /> diff --git a/app/javascript/dashboard/i18n/locale/en/conversation.json b/app/javascript/dashboard/i18n/locale/en/conversation.json index 8778c0cce..a38c9dc2f 100644 --- a/app/javascript/dashboard/i18n/locale/en/conversation.json +++ b/app/javascript/dashboard/i18n/locale/en/conversation.json @@ -213,6 +213,7 @@ "TIP_AUDIORECORDER_ICON": "Record audio", "TIP_AUDIORECORDER_PERMISSION": "Allow access to audio", "TIP_AUDIORECORDER_ERROR": "Could not open the audio", + "AUDIO_CONVERSION_FAILED": "Audio conversion failed. Please try again.", "DRAG_DROP": "Drag and drop here to attach", "START_AUDIO_RECORDING": "Start audio recording", "STOP_AUDIO_RECORDING": "Stop audio recording", diff --git a/app/javascript/dashboard/i18n/locale/pt_BR/conversation.json b/app/javascript/dashboard/i18n/locale/pt_BR/conversation.json index 0826224e6..99fa128f7 100644 --- a/app/javascript/dashboard/i18n/locale/pt_BR/conversation.json +++ b/app/javascript/dashboard/i18n/locale/pt_BR/conversation.json @@ -205,6 +205,7 @@ "TIP_AUDIORECORDER_ICON": "Gravar áudio", "TIP_AUDIORECORDER_PERMISSION": "Permitir acesso ao áudio", "TIP_AUDIORECORDER_ERROR": "Não foi possível abrir o áudio", + "AUDIO_CONVERSION_FAILED": "Falha na conversão do áudio. Tente novamente.", "DRAG_DROP": "Arraste e solte aqui para anexar", "START_AUDIO_RECORDING": "Iniciar gravação de áudio", "STOP_AUDIO_RECORDING": "Parar gravação de áudio", diff --git a/app/services/audio/transcode_service.rb b/app/services/audio/transcode_service.rb new file mode 100644 index 000000000..5a335d60d --- /dev/null +++ b/app/services/audio/transcode_service.rb @@ -0,0 +1,78 @@ +class Audio::TranscodeService + SUPPORTED_FORMATS = { 'opus' => { codec: 'libopus', extension: 'ogg', content_type: 'audio/ogg' } }.freeze + + def initialize(attachment, target_format, source_file: nil) + @attachment = attachment + @target_format = target_format + @source_file = source_file + end + + def perform + validate_format! + return if already_in_target_format? + + transcode_attachment + end + + private + + def already_in_target_format? + format_config = SUPPORTED_FORMATS[@target_format] + content_type = @attachment.file.content_type + return true if content_type == format_config[:content_type] + + # Marcel may detect Opus-in-OGG as audio/opus; treat as already in target format + # when transcoding to Opus to avoid unnecessary re-transcoding + @target_format == 'opus' && content_type == 'audio/opus' + end + + def validate_format! + return if SUPPORTED_FORMATS.key?(@target_format) + + raise CustomExceptions::Audio::UnsupportedFormatError, + "Unsupported transcode format: #{@target_format}. Supported: #{SUPPORTED_FORMATS.keys.join(', ')}" + end + + def transcode_attachment + format_config = SUPPORTED_FORMATS[@target_format] + input_file = nil + output_file = nil + input_file = download_to_tempfile + output_file = Tempfile.new(['transcoded', ".#{format_config[:extension]}"]) + movie = FFMPEG::Movie.new(input_file.path) + raise CustomExceptions::Audio::TranscodingError, 'Invalid or unreadable audio file' unless movie.valid? + + movie.transcode(output_file.path, audio_codec: format_config[:codec], custom: %w[-vn -map_metadata -1]) + replace_attachment_file(output_file, format_config) + rescue FFMPEG::Error => e + raise CustomExceptions::Audio::TranscodingError, "FFmpeg transcoding failed: #{e.message}" + ensure + input_file&.close! + output_file&.close! + end + + def download_to_tempfile + tempfile = Tempfile.new(['original_audio', File.extname(@attachment.file.filename.to_s)]) + tempfile.binmode + if @source_file && (@source_file.respond_to?(:tempfile) || @source_file.respond_to?(:path)) + source_path = @source_file.respond_to?(:tempfile) ? @source_file.tempfile.path : @source_file.path + IO.copy_stream(source_path, tempfile) + else + @attachment.file.blob.open { |file| IO.copy_stream(file, tempfile) } + end + tempfile.rewind + tempfile + end + + def replace_attachment_file(output_file, format_config) + filename = "#{File.basename(@attachment.file.filename.to_s, '.*')}.#{format_config[:extension]}" + File.open(output_file.path, 'rb') do |file| + @attachment.file.attach( + io: file, + filename: filename, + content_type: format_config[:content_type] + ) + end + @attachment.file_type = :audio + end +end diff --git a/app/services/whatsapp/providers/whatsapp_cloud_service.rb b/app/services/whatsapp/providers/whatsapp_cloud_service.rb index 4696fdbfa..0acc888f3 100644 --- a/app/services/whatsapp/providers/whatsapp_cloud_service.rb +++ b/app/services/whatsapp/providers/whatsapp_cloud_service.rb @@ -156,14 +156,12 @@ class Whatsapp::Providers::WhatsappCloudService < Whatsapp::Providers::BaseServi def send_attachment_message(phone_number, message) attachment = message.attachments.first + normalize_opus_content_type(attachment) type = %w[image audio video].include?(attachment.file_type) ? attachment.file_type : 'document' - type_content = { - 'link': attachment.download_url - } + type_content = { 'link' => attachment.download_url } type_content['caption'] = message.outgoing_content unless %w[audio sticker].include?(type) type_content['filename'] = attachment.file.filename if type == 'document' - # FIXME: This requires transcoding to opus/ogg. - # type_content['voice'] = true if type == 'audio' && attachment.meta&.dig('is_recorded_audio') + type_content['voice'] = true if voice_message?(type, attachment) response = HTTParty.post( "#{phone_id_path('v24.0')}/messages", headers: api_headers, @@ -179,6 +177,25 @@ class Whatsapp::Providers::WhatsappCloudService < Whatsapp::Providers::BaseServi process_response(response, message) end + def voice_message?(type, attachment) + type == 'audio' && attachment.meta&.dig('is_recorded_audio') && attachment.file.content_type == 'audio/ogg' + end + + # Marcel gem may re-detect OGG/Opus files as audio/opus after ActiveStorage + # blob attachment, but WhatsApp Cloud API requires audio/ogg content type + # for voice messages. Normalize so the download URL serves the correct + # Content-Type header. No-op when the frontend already uploads as audio/ogg. + def normalize_opus_content_type(attachment) + return unless attachment.file.attached? + + blob = attachment.file.blob + return unless blob.content_type == 'audio/opus' + + return if blob.update(content_type: 'audio/ogg') + + Rails.logger.error("Failed to normalize blob #{blob.id} content_type from audio/opus to audio/ogg") + end + def error_message(response) # https://developers.facebook.com/docs/whatsapp/cloud-api/support/error-codes/#sample-response response.parsed_response&.dig('error', 'message') diff --git a/docker/Dockerfile b/docker/Dockerfile index 645a61a55..ac0435927 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -128,6 +128,7 @@ RUN apk update && apk add --no-cache \ imagemagick \ git \ vips \ + ffmpeg \ && gem install bundler -v "$BUNDLER_VERSION" COPY --from=node /usr/local/bin/node /usr/local/bin/ diff --git a/lib/custom_exceptions/audio.rb b/lib/custom_exceptions/audio.rb new file mode 100644 index 000000000..7932e7801 --- /dev/null +++ b/lib/custom_exceptions/audio.rb @@ -0,0 +1,13 @@ +module CustomExceptions::Audio + class UnsupportedFormatError < CustomExceptions::Base + def message + @data + end + end + + class TranscodingError < CustomExceptions::Base + def message + @data + end + end +end diff --git a/spec/builders/messages/message_builder_spec.rb b/spec/builders/messages/message_builder_spec.rb index a2922a0c5..76a2388b7 100644 --- a/spec/builders/messages/message_builder_spec.rb +++ b/spec/builders/messages/message_builder_spec.rb @@ -186,6 +186,50 @@ describe Messages::MessageBuilder do }) end + context 'when transcode_audio is set' do + let(:params) do + ActionController::Parameters.new({ + content: 'test', + transcode_audio: 'opus', + attachments: [Rack::Test::UploadedFile.new('spec/assets/sample.mp3', 'audio/mpeg')] + }) + end + + it 'transcodes audio attachment and sets is_recorded_audio metadata' do + service_instance = instance_double(Audio::TranscodeService) + allow(Audio::TranscodeService).to receive(:new).and_return(service_instance) + allow(service_instance).to receive(:perform) + + message = message_builder + + expect(Audio::TranscodeService).to have_received(:new) + expect(service_instance).to have_received(:perform) + expect(message.attachments.first.meta).to include('is_recorded_audio' => true) + end + + it 'does not transcode non-audio attachments' do + allow(Audio::TranscodeService).to receive(:new) + params[:attachments] = [Rack::Test::UploadedFile.new('spec/assets/avatar.png', 'image/png')] + + message = message_builder + + expect(Audio::TranscodeService).not_to have_received(:new) + expect(message.attachments.first.file_type).to eq 'image' + end + end + + context 'when transcode_audio is not set' do + it 'does not invoke transcoding service' do + allow(Audio::TranscodeService).to receive(:new) + params[:attachments] = [Rack::Test::UploadedFile.new('spec/assets/sample.mp3', 'audio/mpeg')] + + message = message_builder + + expect(Audio::TranscodeService).not_to have_received(:new) + expect(message.attachments.first.file_type).to eq 'audio' + end + end + context 'when DIRECT_UPLOAD_ENABLED' do let(:params) do ActionController::Parameters.new({ diff --git a/spec/services/audio/transcode_service_spec.rb b/spec/services/audio/transcode_service_spec.rb new file mode 100644 index 000000000..aa20e13a0 --- /dev/null +++ b/spec/services/audio/transcode_service_spec.rb @@ -0,0 +1,106 @@ +require 'rails_helper' + +RSpec.describe Audio::TranscodeService do + let(:message) { create(:message) } + let(:attachment) do + attachment = message.attachments.new(account_id: message.account_id, file_type: :audio) + attachment.file.attach(io: Rails.root.join('spec/assets/sample.mp3').open, filename: 'sample.mp3', content_type: 'audio/mpeg') + attachment.save! + attachment + end + + describe '#perform' do + context 'with unsupported format' do + it 'raises UnsupportedFormatError' do + error = nil + begin + described_class.new(attachment, 'aac').perform + rescue StandardError => e + error = e + end + + expect(error).not_to be_nil + expect(error.class.name).to eq('CustomExceptions::Audio::UnsupportedFormatError') + expect(error.message).to match(/Unsupported transcode format: aac/) + end + end + + context 'with opus format' do + it 'skips transcoding when file is already in target format' do + ogg_attachment = message.attachments.new(account_id: message.account_id, file_type: :audio) + ogg_attachment.file.attach(io: StringIO.new('ogg_data'), filename: 'recording.ogg', content_type: 'audio/ogg') + ogg_attachment.save! + + allow(FFMPEG::Movie).to receive(:new) + + described_class.new(ogg_attachment, 'opus').perform + + expect(FFMPEG::Movie).not_to have_received(:new) + expect(ogg_attachment.file.content_type).to eq('audio/ogg') + end + + it 'transcodes audio to ogg/opus format' do + mock_movie = instance_double(FFMPEG::Movie, valid?: true) + allow(FFMPEG::Movie).to receive(:new).and_return(mock_movie) + allow(mock_movie).to receive(:transcode) do |output_path, _options| + File.write(output_path, 'fake_opus_data') + end + + described_class.new(attachment, 'opus').perform + + expect(attachment.file.filename.to_s).to eq('sample.ogg') + expect(attachment.file.content_type).to eq('audio/ogg') + expect(attachment.file_type).to eq('audio') + end + + it 'transcodes using source_file when provided' do + uploaded_file = Rack::Test::UploadedFile.new(Rails.root.join('spec/assets/sample.mp3').to_s, 'audio/mpeg') + + mock_movie = instance_double(FFMPEG::Movie, valid?: true) + allow(FFMPEG::Movie).to receive(:new).and_return(mock_movie) + allow(mock_movie).to receive(:transcode) do |output_path, _options| + File.write(output_path, 'fake_opus_data') + end + + described_class.new(attachment, 'opus', source_file: uploaded_file).perform + + expect(attachment.file.filename.to_s).to eq('sample.ogg') + expect(attachment.file.content_type).to eq('audio/ogg') + expect(attachment.file_type).to eq('audio') + end + + it 'raises TranscodingError when the audio file is invalid' do + mock_movie = instance_double(FFMPEG::Movie, valid?: false) + allow(FFMPEG::Movie).to receive(:new).and_return(mock_movie) + + error = nil + begin + described_class.new(attachment, 'opus').perform + rescue StandardError => e + error = e + end + + expect(error).not_to be_nil + expect(error.class.name).to eq('CustomExceptions::Audio::TranscodingError') + expect(error.message).to match(/Invalid or unreadable audio file/) + end + + it 'raises TranscodingError when FFmpeg fails' do + mock_movie = instance_double(FFMPEG::Movie, valid?: true) + allow(FFMPEG::Movie).to receive(:new).and_return(mock_movie) + allow(mock_movie).to receive(:transcode).and_raise(FFMPEG::Error, 'encoding failed') + + error = nil + begin + described_class.new(attachment, 'opus').perform + rescue StandardError => e + error = e + end + + expect(error).not_to be_nil + expect(error.class.name).to eq('CustomExceptions::Audio::TranscodingError') + expect(error.message).to match(/FFmpeg transcoding failed/) + end + end + end +end diff --git a/spec/services/whatsapp/providers/whatsapp_cloud_service_spec.rb b/spec/services/whatsapp/providers/whatsapp_cloud_service_spec.rb index 566a85eea..8655978de 100644 --- a/spec/services/whatsapp/providers/whatsapp_cloud_service_spec.rb +++ b/spec/services/whatsapp/providers/whatsapp_cloud_service_spec.rb @@ -109,23 +109,43 @@ describe Whatsapp::Providers::WhatsappCloudService do expect(service.send_message('+123456789', message)).to eq 'message_id' end - # FIXME: This requires transcoding to opus/ogg. - # it 'calls message endpoints with voice flag for recorded audio attachment' do - # attachment = message.attachments.new(account_id: message.account_id, file_type: :audio, meta: { 'is_recorded_audio' => true }) - # attachment.file.attach(io: Rails.root.join('spec/assets/sample.mp3').open, filename: 'sample.mp3', content_type: 'audio/mpeg') + it 'does not send voice flag for recorded audio in non-ogg format' do + attachment = message.attachments.new(account_id: message.account_id, file_type: :audio, meta: { 'is_recorded_audio' => true }) + attachment.file.attach(io: Rails.root.join('spec/assets/sample.mp3').open, filename: 'sample.mp3', content_type: 'audio/mpeg') - # stub_request(:post, 'https://graph.facebook.com/v24.0/123456789/messages') - # .with( - # body: hash_including({ - # messaging_product: 'whatsapp', - # to: '+123456789', - # type: 'audio', - # audio: WebMock::API.hash_including({ link: anything, voice: true }) - # }) - # ) - # .to_return(status: 200, body: whatsapp_response.to_json, headers: response_headers) - # expect(service.send_message('+123456789', message)).to eq 'message_id' - # end + stub_request(:post, 'https://graph.facebook.com/v24.0/123456789/messages') + .with( + body: hash_including({ + messaging_product: 'whatsapp', + to: '+123456789', + type: 'audio', + audio: WebMock::API.hash_including({ link: anything }) + }) + ) + .to_return(status: 200, body: whatsapp_response.to_json, headers: response_headers) + + # Ensure voice flag is NOT present for non-ogg audio + expect(service.send_message('+123456789', message)).to eq 'message_id' + expect(WebMock).not_to(have_requested(:post, 'https://graph.facebook.com/v24.0/123456789/messages') + .with { |req| JSON.parse(req.body).dig('audio', 'voice') }) + end + + it 'sends voice flag for recorded audio in ogg format' do + attachment = message.attachments.new(account_id: message.account_id, file_type: :audio, meta: { 'is_recorded_audio' => true }) + attachment.file.attach(io: Rails.root.join('spec/assets/sample.ogg').open, filename: 'sample.ogg', content_type: 'audio/ogg') + + stub_request(:post, 'https://graph.facebook.com/v24.0/123456789/messages') + .with( + body: hash_including({ + messaging_product: 'whatsapp', + to: '+123456789', + type: 'audio', + audio: WebMock::API.hash_including({ link: anything, voice: true }) + }) + ) + .to_return(status: 200, body: whatsapp_response.to_json, headers: response_headers) + expect(service.send_message('+123456789', message)).to eq 'message_id' + end end end