-
-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Implement fast WAV ImaAdpcm decoder #22139
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: bleed
Are you sure you want to change the base?
Changes from all commits
31e86be
3c85720
a2826de
6165ddd
f48e006
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -172,5 +172,20 @@ public static Stream CreateWithoutOwningStream(Stream stream, long offset, int c | |
| stream.Seek(offset, SeekOrigin.Begin); | ||
| return new MemoryStream(stream.ReadBytes(count)); | ||
| } | ||
|
|
||
| public static ReadOnlySpan<byte> GetReadableData(Stream stream, long offset, int size) | ||
| { | ||
| if (stream is MemoryStream ms) | ||
| { | ||
| // avoid copying where possible | ||
| var buf = ms.GetBuffer(); | ||
| return new ReadOnlySpan<byte>(buf, (int)offset, Math.Min(size, buf.Length - (int)offset)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
here you truncate, but below you don't.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. below it's already the right size |
||
| } | ||
|
|
||
| var buffer = new byte[size]; | ||
| stream.Seek(offset, SeekOrigin.Begin); | ||
| stream.ReadExactly(buffer); | ||
| return buffer; | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,7 @@ public interface ISoundFormat : IDisposable | |
| int SampleRate { get; } | ||
| float LengthInSeconds { get; } | ||
| Stream GetPCMInputStream(); | ||
| byte[] GetPCMData(); | ||
| } | ||
|
|
||
| public enum SoundType { World, UI } | ||
|
|
@@ -93,7 +94,7 @@ public void Initialize(ISoundLoader[] loaders, IReadOnlyFileSystem fileSystem) | |
| this.loaders = loaders; | ||
| this.fileSystem = fileSystem; | ||
| ISoundSource LoadIntoMemory(ISoundFormat soundFormat) => soundEngine.AddSoundSourceFromMemory( | ||
| soundFormat.GetPCMInputStream().ReadAllBytes(), soundFormat.Channels, soundFormat.SampleBits, soundFormat.SampleRate); | ||
| soundFormat.GetPCMData(), soundFormat.Channels, soundFormat.SampleBits, soundFormat.SampleRate); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To avoid polluting the existing classes with extra API surface here, perhaps instead the Stream could detect a
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean codegen?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I mean in the Read method of the Stream you could do something conceptually akin to: public override int Read(Span<byte> buffer)
{
if (position == 0 && buffer.Length == totalLengthOfFile)
{
// This is a request to read the whole stream in one go - use the fast-path here.
}
// This is a request to read a portion of the file, use the slow-path.
}
This means we can have a fast-path and slow-path - but they're both exposed via the existing Stream abstraction, so we don't need a new API surface for getting a byte array directly.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or if that's still too late in the process - then copy things into a MemoryStream ahead of time and return that instead. |
||
| sounds = new Cache<string, ISoundSource>(filename => LoadSound(filename, LoadIntoMemory)); | ||
| currentSounds.Clear(); | ||
| currentNotifications.Clear(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,6 +42,12 @@ public sealed class OggFormat : ISoundFormat | |
| public int SampleRate => reader.SampleRate; | ||
| public float LengthInSeconds { get; } | ||
| public Stream GetPCMInputStream() { return new OggStream(new OggFormat(this)); } | ||
| public byte[] GetPCMData() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Above you have the same code for a different audio format. It feels like this function should be in a base class, or a helper.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's here just as a placeholder. Ideally every format would implement its own fast path |
||
| { | ||
| using var pcmStream = GetPCMInputStream(); | ||
| return pcmStream.ReadAllBytes(); | ||
| } | ||
|
|
||
| public void Dispose() { reader.Dispose(); } | ||
|
|
||
| readonly VorbisReader reader; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,10 @@ | |
| #endregion | ||
|
|
||
| using System; | ||
| using System.IO; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Runtime.InteropServices; | ||
| using OpenRA.Primitives; | ||
|
|
||
| namespace OpenRA.Mods.Common.FileFormats | ||
| { | ||
|
|
@@ -30,39 +34,49 @@ public static class ImaAdpcmReader | |
| 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767 | ||
| ]; | ||
|
|
||
| public static short DecodeImaAdpcmSample(byte b, ref int index, ref int current) | ||
| /// <summary> | ||
| /// Decodes a single IMA ADPCM nibble to a PCM sample. | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// Branchless and only the output variables leave registers. | ||
| /// </remarks> | ||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| public static short DecodeImaAdpcmSample(byte nibble, ref byte idx, ref short pred) | ||
| { | ||
| var sb = (b & 8) != 0; | ||
| b &= 7; | ||
| var step = StepTable[idx]; | ||
| var diff = step >> 3; | ||
|
|
||
| var delta = StepTable[index] * b / 4 + StepTable[index] / 8; | ||
| if (sb) | ||
| delta = -delta; | ||
| var mask = nibble & 7; | ||
| diff += ((mask >> 2) & 1) * step; | ||
| diff += ((mask >> 1) & 1) * (step >> 1); | ||
| diff += (mask & 1) * (step >> 2); | ||
|
|
||
| current += delta; | ||
| if (current > short.MaxValue) | ||
| current = short.MaxValue; | ||
| // negate the delta when the sign bit (bit 3) of the nibble is set | ||
| var sign = (nibble & 8) != 0 ? -1 : 1; | ||
| diff *= sign; | ||
|
|
||
| if (current < short.MinValue) | ||
| current = short.MinValue; | ||
| var sample = pred + diff; | ||
|
|
||
| index += IndexAdjust[b]; | ||
| if (index < 0) | ||
| index = 0; | ||
| // saturate to the 16-bit sample range via a single unsigned range check | ||
| if ((uint)(sample - short.MinValue) > ushort.MaxValue) | ||
| sample = sample > 0 ? short.MaxValue : short.MinValue; | ||
|
|
||
| if (index > 88) | ||
| index = 88; | ||
| pred = (short)sample; | ||
|
|
||
| return (short)current; | ||
| var newIdx = idx + IndexAdjust[mask]; | ||
| newIdx = newIdx < 0 ? 0 : newIdx > 88 ? 88 : newIdx; | ||
| idx = (byte)newIdx; | ||
|
|
||
| return pred; | ||
| } | ||
|
|
||
| public static byte[] LoadImaAdpcmSound(ReadOnlySpan<byte> raw, ref int index) | ||
| public static byte[] LoadImaAdpcmSound(ReadOnlySpan<byte> raw, ref byte index) | ||
| { | ||
| var currentSample = 0; | ||
| short currentSample = 0; | ||
| return LoadImaAdpcmSound(raw, ref index, ref currentSample); | ||
| } | ||
|
|
||
| public static byte[] LoadImaAdpcmSound(ReadOnlySpan<byte> raw, ref int index, ref int currentSample) | ||
| public static byte[] LoadImaAdpcmSound(ReadOnlySpan<byte> raw, ref byte index, ref short currentSample) | ||
| { | ||
| var dataSize = raw.Length; | ||
| var outputSize = raw.Length * 4; | ||
|
|
@@ -85,5 +99,114 @@ public static byte[] LoadImaAdpcmSound(ReadOnlySpan<byte> raw, ref int index, re | |
|
|
||
| return output; | ||
| } | ||
|
|
||
| public static byte[] ReadData(Stream stream, long dataOffset, int dataSize, short blockAlign, short channels) | ||
| { | ||
| const int SamplesPerGroup = 8; | ||
|
|
||
| ArgumentNullException.ThrowIfNull(stream); | ||
|
|
||
| var sourceData = SegmentStream.GetReadableData(stream, dataOffset, dataSize); | ||
|
|
||
| var numBlocks = dataSize / blockAlign; | ||
|
|
||
| var predictorSize = 4 * channels; | ||
| var blockDataSize = blockAlign - predictorSize; | ||
|
|
||
| // We get two samples per nibble | ||
| var samplesPerChannel = blockDataSize * 2 / channels; | ||
|
|
||
| // 8 samples from a 4-byte group | ||
| var groupCount = samplesPerChannel / SamplesPerGroup; | ||
|
|
||
| var predOut = numBlocks * channels * 2; | ||
| var groupOut = numBlocks * groupCount * channels * SamplesPerGroup * 2; | ||
| var estimatedOutDataSize = predOut + groupOut; | ||
|
|
||
| var outData = new byte[estimatedOutDataSize]; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I recall from the spec these files can have a form of compression applied, so they might be larger once uncompressed? I see the commit is saying the chunk for the uncompressed size is often not reliable - but even if we're ignoring what the file claims will be the final size, I'm not seeing how we're dealing with it here?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are calculating the total uncompressed size (unrelated: in many ways this is the bound check)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ImaAdpcm format is the compression, it reduces file size to 1/4th the size and the headers do remain uncompressed |
||
|
|
||
| // PERF: The output is 16-bit PCM, so we can write bytes as if they were shorts for less CPU churn. | ||
| var outShorts = MemoryMarshal.Cast<byte, short>(outData.AsSpan()); | ||
|
|
||
| // NOTE: decoding a block is sequentially dependent on predictor/index. | ||
| Span<short> predictor = stackalloc short[channels]; | ||
| Span<byte> index = stackalloc byte[channels]; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'll need to validate the channel count is reasonable - wouldn't want to stack allocate if the file claimed to have short.MaxValue channels for example.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it should always be 1 or 2 channels. We could of course reject if there are more
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason it's important is that the process can't recover from a stack overflow - so a malformed or malicious file will crash the process if we stackalloc too large a value. For other parsing errors we could try-catch and just not play the audio - but a stack overflow kills our process outright. |
||
|
|
||
| // PERF: Avoid bounds checks by using refs. | ||
| ref var srcRef = ref MemoryMarshal.GetReference(sourceData); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unsafe code for an operation as dangerous as file parsing to avoid a few bounds checks is insane to me.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are working on already bound checked data, removing bound checks in hot loops did improve the perf by 3-4%
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But the bounds check you've calculated is based on input controlled by the file. You can't trust the file. If the file is malformed or malicious, then we won't have the correct bounds. We appear to have exactly those sorts of bugs here: we'll read input of a length derived from untrusted header fields. A well-formed file will have everything match up so we consume the right amount of input and produce the right amount of output. A malformed or malicious file can make false claims where things don't line up, meaning we'll walk off the end of the input or output arrays. If that happened in our existing parsers, we'd get an array out of bounds exception and could recover and keep running. By using unsafe, we've elevated the bug into either an unrecoverable crash if we read/write out of our process memory - or worse a security issue if the file is controlled by an attacker and the bad accesses can be manipulated in a manner advantageous to them. If we used safe code, then at least bugs like this have reduced impact potential - by dropping to unsafe we've signed ourselves up to much worse outcomes if the code contains any bugs. If I had a magical lever to downgrade the impact of bugs I wrote from "security issue/denial-of-service" to "mostly recoverable" and it only cost me 3-4% perf, I'd pull that lever most times (in this case, the lever is "don't use unsafe code to skip bounds checks")
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, if we really are sensitive to 3-4% perf then the better solution is to load those files up-front behind the load screen and keep them in ram! |
||
| ref var outRef = ref MemoryMarshal.GetReference(outShorts); | ||
| ref var predRef = ref MemoryMarshal.GetReference(predictor); | ||
| ref var idxRef = ref MemoryMarshal.GetReference(index); | ||
|
|
||
| // Global decoded sample counter | ||
| var src = 0; | ||
| var outSample = 0; | ||
|
|
||
| for (var block = 0; block < numBlocks; block++) | ||
| { | ||
| // Initial states | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would put this init block in a separate function. Assuming you don't reference too many local vars.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's so little code it's using all local variables |
||
| for (var c = 0; c < channels; c++) | ||
| { | ||
| var offset = src + c * 4; | ||
|
|
||
| // Load initial values. | ||
| var pred = (short)(Unsafe.Add(ref srcRef, offset) | (Unsafe.Add(ref srcRef, offset + 1) << 8)); | ||
| var idx = Unsafe.Add(ref srcRef, offset + 2); | ||
|
|
||
| Unsafe.Add(ref predRef, c) = pred; | ||
| Unsafe.Add(ref idxRef, c) = idx; | ||
| } | ||
|
|
||
| src += predictorSize; | ||
|
|
||
| // Write initial predictor samples interleaved | ||
| for (var c = 0; c < channels; c++) | ||
| Unsafe.Add(ref outRef, outSample + c) = Unsafe.Add(ref predRef, c); | ||
|
|
||
| outSample += channels; | ||
|
|
||
| for (var iter = 0; iter < groupCount; iter++) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We're decoding in a batch of 8 at a time, I assume as a form of loop unrolling? Is there a guarantee that chunks have a multiple of 8 in them? If not, we'll need another loop to catch the leftovers or we're going to miss out on some samples.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is guaranteed, if there's an incomplete group at the end of the file we should just skip it
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. though, the fast path will just fail and decode garbage |
||
| { | ||
| // Decode 8 samples sequentially per channel | ||
| for (var c = 0; c < channels; c++) | ||
| { | ||
| ref var pred = ref Unsafe.Add(ref predRef, c); | ||
| ref var idx = ref Unsafe.Add(ref idxRef, c); | ||
|
|
||
| var b0 = Unsafe.Add(ref srcRef, src + 0); | ||
| var b1 = Unsafe.Add(ref srcRef, src + 1); | ||
| var b2 = Unsafe.Add(ref srcRef, src + 2); | ||
| var b3 = Unsafe.Add(ref srcRef, src + 3); | ||
|
|
||
| src += 4; | ||
|
|
||
| // PERF: Decode into temporary variables so they could be easily inlined directly to output. | ||
| var s0 = DecodeImaAdpcmSample((byte)(b0 & 0x0F), ref idx, ref pred); | ||
| var s1 = DecodeImaAdpcmSample((byte)(b0 >> 4), ref idx, ref pred); | ||
| var s2 = DecodeImaAdpcmSample((byte)(b1 & 0x0F), ref idx, ref pred); | ||
| var s3 = DecodeImaAdpcmSample((byte)(b1 >> 4), ref idx, ref pred); | ||
| var s4 = DecodeImaAdpcmSample((byte)(b2 & 0x0F), ref idx, ref pred); | ||
| var s5 = DecodeImaAdpcmSample((byte)(b2 >> 4), ref idx, ref pred); | ||
| var s6 = DecodeImaAdpcmSample((byte)(b3 & 0x0F), ref idx, ref pred); | ||
| var s7 = DecodeImaAdpcmSample((byte)(b3 >> 4), ref idx, ref pred); | ||
|
|
||
| // Write interleaved samples (one sample per channel) | ||
| var basePos = outSample + c; | ||
| Unsafe.Add(ref outRef, basePos + channels * 0) = s0; | ||
| Unsafe.Add(ref outRef, basePos + channels * 1) = s1; | ||
| Unsafe.Add(ref outRef, basePos + channels * 2) = s2; | ||
| Unsafe.Add(ref outRef, basePos + channels * 3) = s3; | ||
| Unsafe.Add(ref outRef, basePos + channels * 4) = s4; | ||
| Unsafe.Add(ref outRef, basePos + channels * 5) = s5; | ||
| Unsafe.Add(ref outRef, basePos + channels * 6) = s6; | ||
| Unsafe.Add(ref outRef, basePos + channels * 7) = s7; | ||
| } | ||
|
|
||
| outSample += channels * 8; | ||
| } | ||
| } | ||
|
|
||
| return outData; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method is a bit weird and I think should be moved to the one caller as a private helper.
Perhaps a TryGet... method instead, in case it could reuse an existing buffer in the non-MemoryStream case.