diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index e14c11ca..f8ec91b3 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -207,6 +207,17 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) output_ptr[off + 7] += w7 * input_ptr[off + 7]; } } + else if (channels == 3) + { + const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; + for (int f = 0; f < num_frames; f++) + { + const int off = f * 3; + output_ptr[off] += w0 * input_ptr[off]; + output_ptr[off + 1] += w1 * input_ptr[off + 1]; + output_ptr[off + 2] += w2 * input_ptr[off + 2]; + } + } else { // General depthwise path with loop unrolling @@ -349,6 +360,52 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) (w0_10 * i0_0 + w0_11 * i0_1) + (w1_10 * i1_0 + w1_11 * i1_1) + (w2_10 * i2_0 + w2_11 * i2_1); } } + else if (kernel_size == 6 && out_ch == 3 && in_ch == 3) + { + // Fused 3x3 kernel_size=6: read all 6 input blocks and compute in one pass + const long dil = this->_dilation; + auto in0 = _input_buffer.Read(num_frames, 5 * dil); + auto in1 = _input_buffer.Read(num_frames, 4 * dil); + auto in2 = _input_buffer.Read(num_frames, 3 * dil); + auto in3 = _input_buffer.Read(num_frames, 2 * dil); + auto in4 = _input_buffer.Read(num_frames, dil); + auto in5 = _input_buffer.Read(num_frames, 0); + + const float* __restrict__ in_ptrs[6] = { + in0.data(), in1.data(), in2.data(), + in3.data(), in4.data(), in5.data() + }; + float* __restrict__ output_ptr = _output.data(); + + // Cache all 54 weights on stack (6 taps x 3x3 matrix, column-major) + float w[6][9]; + for (int k = 0; k < 6; k++) + { + const float* wp = this->_weight[k].data(); + for (int j = 0; j < 9; j++) + w[k][j] = wp[j]; + } + + for (int f = 0; f < num_frames; f++) + { + const int off = f * 3; + float o0 = 0.0f, o1 = 0.0f, o2 = 0.0f; + + for (int k = 0; k < 6; k++) + { + const float i0 = in_ptrs[k][off]; + const float i1 = in_ptrs[k][off + 1]; + const float i2 = in_ptrs[k][off + 2]; + o0 += w[k][0] * i0 + w[k][3] * i1 + w[k][6] * i2; + o1 += w[k][1] * i0 + w[k][4] * i1 + w[k][7] * i2; + o2 += w[k][2] * i0 + w[k][5] * i1 + w[k][8] * i2; + } + + output_ptr[off] = o0; + output_ptr[off + 1] = o1; + output_ptr[off + 2] = o2; + } + } else { // General inline GEMM path uses += accumulation, so needs setZero diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 2cabf22d..3aa22304 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -31,7 +31,7 @@ void nam::DSP::prewarm() { if (mMaxBufferSize == 0) { - SetMaxBufferSize(4096); + SetMaxBufferSize(NAM_DEFAULT_MAX_BUFFER_SIZE); } const int prewarmSamples = PrewarmSamples(); if (prewarmSamples == 0) @@ -467,6 +467,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu void nam::Conv1x1::process_(const Eigen::Ref& input, const int num_frames) { assert(num_frames <= _output.cols()); +#ifdef NAM_USE_INLINE_GEMM + bool bias_fused = false; +#endif if (this->_is_depthwise) { @@ -499,6 +502,17 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons output_ptr[f * 2 + 1] = w1 * in_val; } } + else if (out_ch == 3 && in_ch == 1) + { + const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; + for (int f = 0; f < num_frames; f++) + { + const float in_val = input_ptr[f * in_stride]; + output_ptr[f * 3] = w0 * in_val; + output_ptr[f * 3 + 1] = w1 * in_val; + output_ptr[f * 3 + 2] = w2 * in_val; + } + } else if (out_ch == 4 && in_ch == 1) { const float w0 = weight_ptr[0], w1 = weight_ptr[1]; @@ -521,6 +535,28 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons output_ptr[f] = w0 * in_col[0] + w1 * in_col[1]; } } + else if (out_ch == 1 && in_ch == 3) + { + const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; + if (this->_do_bias) + { + const float b0 = this->_bias(0); + for (int f = 0; f < num_frames; f++) + { + const float* __restrict__ in_col = input_ptr + f * in_stride; + output_ptr[f] = w0 * in_col[0] + w1 * in_col[1] + w2 * in_col[2] + b0; + } + bias_fused = true; + } + else + { + for (int f = 0; f < num_frames; f++) + { + const float* __restrict__ in_col = input_ptr + f * in_stride; + output_ptr[f] = w0 * in_col[0] + w1 * in_col[1] + w2 * in_col[2]; + } + } + } else if (out_ch == 2 && in_ch == 2) { // 2x2 fully unrolled @@ -583,15 +619,33 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2]; const float w01 = weight_ptr[3], w11 = weight_ptr[4], w21 = weight_ptr[5]; const float w02 = weight_ptr[6], w12 = weight_ptr[7], w22 = weight_ptr[8]; - for (int f = 0; f < num_frames; f++) + if (this->_do_bias) { - const float* __restrict__ in_col = input_ptr + f * in_stride; - const float i0 = in_col[0]; - const float i1 = in_col[1]; - const float i2 = in_col[2]; - output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2; - output_ptr[f * 3 + 1] = w10 * i0 + w11 * i1 + w12 * i2; - output_ptr[f * 3 + 2] = w20 * i0 + w21 * i1 + w22 * i2; + const float b0 = this->_bias(0), b1 = this->_bias(1), b2 = this->_bias(2); + for (int f = 0; f < num_frames; f++) + { + const float* __restrict__ in_col = input_ptr + f * in_stride; + const float i0 = in_col[0]; + const float i1 = in_col[1]; + const float i2 = in_col[2]; + output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2 + b0; + output_ptr[f * 3 + 1] = w10 * i0 + w11 * i1 + w12 * i2 + b1; + output_ptr[f * 3 + 2] = w20 * i0 + w21 * i1 + w22 * i2 + b2; + } + bias_fused = true; + } + else + { + for (int f = 0; f < num_frames; f++) + { + const float* __restrict__ in_col = input_ptr + f * in_stride; + const float i0 = in_col[0]; + const float i1 = in_col[1]; + const float i2 = in_col[2]; + output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2; + output_ptr[f * 3 + 1] = w10 * i0 + w11 * i1 + w12 * i2; + output_ptr[f * 3 + 2] = w20 * i0 + w21 * i1 + w22 * i2; + } } } else if (out_ch == 4 && in_ch == 4) @@ -673,8 +727,21 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons } else { - // Fall back to Eigen for larger matrices where it's more efficient - _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); + // Generic inline GEMM for any matrix size (avoids Eigen overhead for small matrices) + for (int f = 0; f < num_frames; f++) + { + const float* __restrict__ in_col = input_ptr + f * in_stride; + float* __restrict__ out_col = output_ptr + f * out_ch; + for (int o = 0; o < out_ch; o++) + { + float sum = 0.0f; + for (int i = 0; i < in_ch; i++) + { + sum += weight_ptr[i * out_ch + o] * in_col[i]; + } + out_col[o] = sum; + } + } } #else // Single GEMM for all cases - block-diagonal zero structure handles grouping @@ -685,6 +752,8 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons if (this->_do_bias) { #ifdef NAM_USE_INLINE_GEMM + if (!bias_fused) + { const int out_ch = (int)get_out_channels(); float* __restrict__ output_ptr = _output.data(); const float* __restrict__ bias_ptr = this->_bias.data(); @@ -700,6 +769,17 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons output_ptr[off + 1] += b1; } } + else if (out_ch == 3) + { + const float b0 = bias_ptr[0], b1 = bias_ptr[1], b2 = bias_ptr[2]; + for (int f = 0; f < num_frames; f++) + { + const int off = f * 3; + output_ptr[off] += b0; + output_ptr[off + 1] += b1; + output_ptr[off + 2] += b2; + } + } else if (out_ch == 4) { const float b0 = bias_ptr[0], b1 = bias_ptr[1]; @@ -724,6 +804,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons } } } + } // !bias_fused #else _output.leftCols(num_frames).colwise() += this->_bias; #endif diff --git a/NAM/dsp.h b/NAM/dsp.h index 15bc9b81..f7ff4ed5 100644 --- a/NAM/dsp.h +++ b/NAM/dsp.h @@ -18,6 +18,12 @@ #else #define NAM_SAMPLE double #endif +/// \brief Default max buffer size used by prewarm() when none has been set. +/// Override at compile time with -DNAM_DEFAULT_MAX_BUFFER_SIZE=N. +#ifndef NAM_DEFAULT_MAX_BUFFER_SIZE + #define NAM_DEFAULT_MAX_BUFFER_SIZE 4096 +#endif + /// \brief Use a sample rate of -1 if we don't know what the model expects to be run at #define NAM_UNKNOWN_EXPECTED_SAMPLE_RATE -1.0 diff --git a/NAM/film.h b/NAM/film.h index 559de131..27685269 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -98,47 +98,78 @@ class FiLM if (_do_shift) { // scale = top input_dim rows, shift = bottom input_dim rows - for (int f = 0; f < num_frames; f++) + if (input_dim == 3) { - const float* __restrict__ in_col = input_ptr + f * input_stride; - const float* __restrict__ scale_col = scale_shift_ptr + f * scale_shift_rows; - const float* __restrict__ shift_col = scale_col + input_dim; - float* __restrict__ out_col = output_ptr + f * input_dim; - - int i = 0; - for (; i + 3 < input_dim; i += 4) + for (int f = 0; f < num_frames; f++) { - out_col[i] = in_col[i] * scale_col[i] + shift_col[i]; - out_col[i + 1] = in_col[i + 1] * scale_col[i + 1] + shift_col[i + 1]; - out_col[i + 2] = in_col[i + 2] * scale_col[i + 2] + shift_col[i + 2]; - out_col[i + 3] = in_col[i + 3] * scale_col[i + 3] + shift_col[i + 3]; + const float* __restrict__ in_col = input_ptr + f * input_stride; + const float* __restrict__ scale_col = scale_shift_ptr + f * scale_shift_rows; + const float* __restrict__ shift_col = scale_col + 3; + float* __restrict__ out_col = output_ptr + f * 3; + out_col[0] = in_col[0] * scale_col[0] + shift_col[0]; + out_col[1] = in_col[1] * scale_col[1] + shift_col[1]; + out_col[2] = in_col[2] * scale_col[2] + shift_col[2]; } - for (; i < input_dim; i++) + } + else + { + for (int f = 0; f < num_frames; f++) { - out_col[i] = in_col[i] * scale_col[i] + shift_col[i]; + const float* __restrict__ in_col = input_ptr + f * input_stride; + const float* __restrict__ scale_col = scale_shift_ptr + f * scale_shift_rows; + const float* __restrict__ shift_col = scale_col + input_dim; + float* __restrict__ out_col = output_ptr + f * input_dim; + + int i = 0; + for (; i + 3 < input_dim; i += 4) + { + out_col[i] = in_col[i] * scale_col[i] + shift_col[i]; + out_col[i + 1] = in_col[i + 1] * scale_col[i + 1] + shift_col[i + 1]; + out_col[i + 2] = in_col[i + 2] * scale_col[i + 2] + shift_col[i + 2]; + out_col[i + 3] = in_col[i + 3] * scale_col[i + 3] + shift_col[i + 3]; + } + for (; i < input_dim; i++) + { + out_col[i] = in_col[i] * scale_col[i] + shift_col[i]; + } } } } else { // scale only - for (int f = 0; f < num_frames; f++) + if (input_dim == 3) { - const float* __restrict__ in_col = input_ptr + f * input_stride; - const float* __restrict__ scale_col = scale_shift_ptr + f * scale_shift_rows; - float* __restrict__ out_col = output_ptr + f * input_dim; - - int i = 0; - for (; i + 3 < input_dim; i += 4) + for (int f = 0; f < num_frames; f++) { - out_col[i] = in_col[i] * scale_col[i]; - out_col[i + 1] = in_col[i + 1] * scale_col[i + 1]; - out_col[i + 2] = in_col[i + 2] * scale_col[i + 2]; - out_col[i + 3] = in_col[i + 3] * scale_col[i + 3]; + const float* __restrict__ in_col = input_ptr + f * input_stride; + const float* __restrict__ scale_col = scale_shift_ptr + f * scale_shift_rows; + float* __restrict__ out_col = output_ptr + f * 3; + out_col[0] = in_col[0] * scale_col[0]; + out_col[1] = in_col[1] * scale_col[1]; + out_col[2] = in_col[2] * scale_col[2]; } - for (; i < input_dim; i++) + } + else + { + for (int f = 0; f < num_frames; f++) { - out_col[i] = in_col[i] * scale_col[i]; + const float* __restrict__ in_col = input_ptr + f * input_stride; + const float* __restrict__ scale_col = scale_shift_ptr + f * scale_shift_rows; + float* __restrict__ out_col = output_ptr + f * input_dim; + + int i = 0; + for (; i + 3 < input_dim; i += 4) + { + out_col[i] = in_col[i] * scale_col[i]; + out_col[i + 1] = in_col[i + 1] * scale_col[i + 1]; + out_col[i + 2] = in_col[i + 2] * scale_col[i + 2]; + out_col[i + 3] = in_col[i + 3] * scale_col[i + 3]; + } + for (; i < input_dim; i++) + { + out_col[i] = in_col[i] * scale_col[i]; + } } } } diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 4f7e179e..522e304d 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -223,42 +223,66 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma { // (No FiLM) // Store output to head (skip connection: activated conv output) -#ifdef NAM_USE_INLINE_GEMM - if (this->_gating_mode == GatingMode::NONE) - { - // _z has bottleneck rows, data is contiguous - use memcpy - const int total = (int)bottleneck * num_frames; - std::memcpy(this->_output_head.data(), this->_z.data(), total * sizeof(float)); - } - else + // When _skip_head_copy is true, GetOutputHead() returns _z directly, so no copy needed. + if (!this->_skip_head_copy) { - // _z has 2*bottleneck rows but we only want top bottleneck rows - // Column-major: need to copy column by column with stride - const int out_rows = (int)bottleneck; - const int z_rows = (int)this->_z.rows(); // 2*bottleneck for gated - const float* __restrict__ src = this->_z.data(); - float* __restrict__ dst = this->_output_head.data(); - for (int f = 0; f < num_frames; f++) +#ifdef NAM_USE_INLINE_GEMM + if (this->_gating_mode == GatingMode::NONE) { - const float* __restrict__ src_col = src + f * z_rows; - float* __restrict__ dst_col = dst + f * out_rows; - for (int r = 0; r < out_rows; r++) - dst_col[r] = src_col[r]; + // _z has bottleneck rows, data is contiguous - use memcpy + const int total = (int)bottleneck * num_frames; + std::memcpy(this->_output_head.data(), this->_z.data(), total * sizeof(float)); + } + else + { + // _z has 2*bottleneck rows but we only want top bottleneck rows + // Column-major: need to copy column by column with stride + const int out_rows = (int)bottleneck; + const int z_rows = (int)this->_z.rows(); // 2*bottleneck for gated + const float* __restrict__ src = this->_z.data(); + float* __restrict__ dst = this->_output_head.data(); + for (int f = 0; f < num_frames; f++) + { + const float* __restrict__ src_col = src + f * z_rows; + float* __restrict__ dst_col = dst + f * out_rows; + for (int r = 0; r < out_rows; r++) + dst_col[r] = src_col[r]; + } } - } #else - if (this->_gating_mode == GatingMode::NONE) - this->_output_head.leftCols(num_frames).noalias() = this->_z.leftCols(num_frames); - else - this->_output_head.leftCols(num_frames).noalias() = this->_z.topRows(bottleneck).leftCols(num_frames); + if (this->_gating_mode == GatingMode::NONE) + this->_output_head.leftCols(num_frames).noalias() = this->_z.leftCols(num_frames); + else + this->_output_head.leftCols(num_frames).noalias() = this->_z.topRows(bottleneck).leftCols(num_frames); #endif + } } // Store output to next layer (residual connection: input + layer1x1 output, or just input if layer1x1 inactive) if (this->_layer1x1) { +#ifdef NAM_USE_INLINE_GEMM + { + const int channels = (int)this->get_channels(); + const int total = channels * num_frames; + const float* __restrict__ in_ptr = input.data(); + const float* __restrict__ layer_ptr = this->_layer1x1->GetOutput().data(); + float* __restrict__ out_ptr = this->_output_next_layer.data(); + int i = 0; + for (; i + 3 < total; i += 4) + { + out_ptr[i] = in_ptr[i] + layer_ptr[i]; + out_ptr[i + 1] = in_ptr[i + 1] + layer_ptr[i + 1]; + out_ptr[i + 2] = in_ptr[i + 2] + layer_ptr[i + 2]; + out_ptr[i + 3] = in_ptr[i + 3] + layer_ptr[i + 3]; + } + for (; i < total; i++) + out_ptr[i] = in_ptr[i] + layer_ptr[i]; + } +#else this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames) + this->_layer1x1->GetOutput().leftCols(num_frames); +#endif } else { @@ -370,7 +394,25 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs } // Accumulate head output from this layer +#ifdef NAM_USE_INLINE_GEMM + { + const int total = (int)this->_head_output_size * num_frames; + const float* __restrict__ src = this->_layers[i].GetOutputHead().data(); + float* __restrict__ dst = this->_head_inputs.data(); + int j = 0; + for (; j + 3 < total; j += 4) + { + dst[j] += src[j]; + dst[j + 1] += src[j + 1]; + dst[j + 2] += src[j + 2]; + dst[j + 3] += src[j + 3]; + } + for (; j < total; j++) + dst[j] += src[j]; + } +#else this->_head_inputs.leftCols(num_frames).noalias() += this->_layers[i].GetOutputHead().leftCols(num_frames); +#endif } // Store output from last layer - use memcpy for pure copy diff --git a/NAM/wavenet.h b/NAM/wavenet.h index cf84bd20..ebb551a5 100644 --- a/NAM/wavenet.h +++ b/NAM/wavenet.h @@ -242,6 +242,10 @@ class _Layer } } + // When no head1x1 and no gating, _output_head would be a straight copy of _z. + // Skip the copy and return _z directly from GetOutputHead(). + _skip_head_copy = !params.head1x1_params.active && params.gating_mode == GatingMode::NONE; + // Validate & initialize gating/blending activation if (params.gating_mode == GatingMode::GATED) { @@ -361,12 +365,14 @@ class _Layer /// /// Returns the full pre-allocated buffer; only the first num_frames columns /// are valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// When _skip_head_copy is true (no head1x1, no gating), returns _z directly + /// to avoid a redundant memcpy. /// \return Reference to the head output buffer - Eigen::MatrixXf& GetOutputHead() { return this->_output_head; } + Eigen::MatrixXf& GetOutputHead() { return _skip_head_copy ? this->_z : this->_output_head; } /// \brief Get output to head (const version) /// \return Const reference to the head output buffer - const Eigen::MatrixXf& GetOutputHead() const { return this->_output_head; } + const Eigen::MatrixXf& GetOutputHead() const { return _skip_head_copy ? this->_z : this->_output_head; } /// \brief Access Conv1D for Reset() propagation (needed for _LayerArray) /// \return Reference to the internal Conv1D object @@ -395,6 +401,7 @@ class _Layer activations::Activation::Ptr _activation; const GatingMode _gating_mode; const int _bottleneck; // Internal channel count (not doubled when gated) + bool _skip_head_copy = false; // When true, GetOutputHead() returns _z directly (no head1x1, no gating) // Gating/blending activation objects std::unique_ptr _gating_activation;