/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_
#define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_

#include <cstdint>
#include <utility>
#include <vector>

// IWYU pragma: begin_exports
#include "sparse_matmul/compute/ar_inputs.h"
#include "sparse_matmul/compute/gru_gates_arm.h"
#include "sparse_matmul/compute/gru_gates_avx_fixed.h"
#include "sparse_matmul/compute/gru_gates_generic.h"
#include "sparse_matmul/compute/matmul.h"
#include "sparse_matmul/numerics/fixed_types.h"
#include "sparse_matmul/numerics/type_utils.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"
// IWYU pragma: end_exports

namespace csrblocksparse {

// The primary template is a catch-all that routes all unspecialized cases to
// the generic implementations.
template <typename GRUStateType, typename InputType, typename SampleType = void>
class GruGates : public MatmulBase {
 public:
  using SampleWeightType = float;
  static constexpr int kSIMDWidth = kGenericSIMDWidth;

  // Generic GRU function that covers all uses for WaveRNN-like architectures
  // and conditioning. Behavior is controlled by the template parameters as
  // follows:
  // - |kInputsMode| == |k0ARInputs|: There are no autoregressive inputs so
  //   |ar_sample0|, |ar_sample1|, |ar_sample2|, |ar_01_weights|,
  //   |ar_2_weights| are ignored.
  // - |kInputsMode| == |k2ARInputs|: |ar_sample0|, |ar_sample1| are multiplied
  //   by |ar_01_weights| and added to the (conditioning) input.
  // - |kInputsMode| == |k3ARInputs|: in addition to the two AR inputs above,
  //   |ar_sample2| is multiplied by |ar_2_weights| and added to the
  //   (conditioning) input.
  // - If |kSplitGates| is true: |*gru_recurrent_other_ptr| is a secondary
  //   recurrent input that must be added to |*gru_recurrent_ptr|.
  // - |num_replicas| determines the number of duplicates of the output to be
  //   written, separated by |replica_stride|.
  // - |start|, |end| are row indices in [0, |state_size|] giving the range to
  //   be processed by this thread.
  //
  // Previous state is read from |*gru_state_ptr| and the new state is written
  // to *(|gru_state_ptr| + i * |replica_stride| for i in [0, |num_replicas|)).
  template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
            bool kSplitGates = false>
  void GruWithARInput(int start, int end, int state_size,
                      const InputType* gru_recurrent_ptr,
                      const InputType* input_ptr, GRUStateType* gru_state_ptr,
                      const SampleType* ar_sample0 = nullptr,
                      const SampleType* ar_sample1 = nullptr,
                      const SampleWeightType* ar_01_weights = nullptr,
                      int num_replicas = 1, int replica_stride = 0,
                      const SampleType* ar_sample2 = nullptr,
                      const SampleWeightType* ar_2_weights = nullptr,
                      const InputType* gru_recurrent_other_ptr = nullptr) {
    CHECK_EQ(num_replicas, 1) << "Generic code should always have 1 replica";
    GoThroughGates<GRUStateType, InputType, SampleWeightType, SampleType,
                   kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_ptr, gru_recurrent_other_ptr,
        input_ptr, gru_state_ptr, ar_2_weights, state_size, ar_sample0,
        ar_sample1, ar_sample2);
  }
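
  // Illustrative call (a sketch; |recurrent|, |conditioning|, |state|,
  // |sample0|, |sample1| and |qr_weights| are hypothetical caller-owned
  // buffers, not part of this header):
  //
  //   GruGates<float, float, float> gates;
  //   gates.GruWithARInput<ARInputsMode::k2ARInputs>(
  //       /*start=*/0, /*end=*/state_size, state_size, recurrent,
  //       conditioning, state, &sample0, &sample1, qr_weights);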

  // No AR inputs, no split gates, no batching, no replicated outputs.
  // TODO(b/188702959): Redirect conditioning GRU here, removing code from
  // gru_layer.h.
  // When doing so, copy this entry point to the specializations below.
  void PlainGru(int start, int end, int state_size,
                const InputType* gru_recurrent_ptr, const InputType* input_ptr,
                GRUStateType* gru_state_ptr) {
    GruWithARInput<ARInputsMode::k0ARInputs>(
        start, end, state_size, gru_recurrent_ptr, input_ptr, gru_state_ptr);
  }
};

#if defined __ARM_NEON || defined __aarch64__
// Partial specialization for float.
template <>
class GruGates<float, float, float> : public MatmulBase {
 public:
  static constexpr int kSIMDWidth = kNeonSIMDWidth;

  // Generic GRU function covers all uses for WaveRNN-like architectures and
  // conditioning.
  template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
            bool kSplitGates = false>
  void GruWithARInput(int start, int end, int state_size,
                      const float* gru_recurrent_data, const float* input_data,
                      float* gru_state_data, const float* ar_sample0 = nullptr,
                      const float* ar_sample1 = nullptr,
                      const float* ar_01_weights = nullptr,
                      int num_replicas = 1, int replica_stride = 0,
                      const float* ar_sample2 = nullptr,
                      const float* ar_2_weights = nullptr,
                      const float* gru_recurrent_other_data = nullptr) {
    DCHECK_EQ(num_replicas, 1) << "ARM code should always have 1 replica";
    GoThroughGatesFloat<kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_data, gru_recurrent_other_data,
        input_data, gru_state_data, ar_2_weights, state_size, ar_sample0,
        ar_sample1, ar_sample2);
  }
};
#endif  // defined __ARM_NEON || defined __aarch64__

// Partial specialization for fixed types. The sample weights are always float
// regardless of the fixed-point type of the other weights.
template <int kGRUStateBits, int kInputBits, int kSampleBits>
class GruGates<fixed16<kGRUStateBits>, fixed32<kInputBits>,
               fixed16<kSampleBits>> : public MatmulBase {
 public:
#if defined __ARM_NEON || defined __aarch64__
  static constexpr int kSIMDWidth = kNeonSIMDWidth;
#elif defined __AVX2__
  static constexpr int kSIMDWidth = kAVX2SIMDWidth * 2;
#else   // Generic case.
  static constexpr int kSIMDWidth = kGenericSIMDWidth;
#endif  // __ARM_NEON || defined __aarch64__ / __AVX2__

  using GRUStateType = fixed16<kGRUStateBits>;
  using InputType = fixed32<kInputBits>;
  using SampleType = fixed16<kSampleBits>;
  using SampleWeightType = float;
  static constexpr int kInputMantissaBits = InputType::kMantissaBits;
  static constexpr int kSampleMantissaBits = SampleType::kMantissaBits;
  static constexpr int kStateMantissaBits = GRUStateType::kMantissaBits;
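  // Note: in this library's fixed_types.h the template argument counts
  // exponent bits, so (illustratively) fixed16<kSampleBits> carries
  // 15 - kSampleBits mantissa bits and fixed32<kInputBits> carries
  // 31 - kInputBits.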
  // Generic GRU function covers all uses for WaveRNN-like architectures and
  // conditioning.
  template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
            bool kSplitGates = false>
  void GruWithARInput(int start, int end, int state_size,
                      const InputType* gru_recurrent_data,
                      const InputType* input_data, GRUStateType* gru_state_data,
                      const SampleType* ar_sample0 = nullptr,
                      const SampleType* ar_sample1 = nullptr,
                      const SampleWeightType* ar_01_weights = nullptr,
                      int num_replicas = 1, int replica_stride = 0,
                      const SampleType* ar_sample2 = nullptr,
                      const SampleWeightType* ar_2_weights = nullptr,
                      const InputType* gru_recurrent_other_data = nullptr) {
#if defined __ARM_NEON || defined __aarch64__ || defined __AVX2__
    const int32_t* gru_recurrent_ptr =
        reinterpret_cast<const int32_t*>(gru_recurrent_data);
    const int32_t* gru_recurrent_other_ptr =
        reinterpret_cast<const int32_t*>(gru_recurrent_other_data);
    const int32_t* input_ptr = reinterpret_cast<const int32_t*>(input_data);
    int16_t* gru_state_ptr = reinterpret_cast<int16_t*>(gru_state_data);
#if defined __AVX2__
    // The samples are fixed16, but we scale them up and convert to float here
    // so that their product with the QR weights is already on the InputType
    // scale, and no further scaling is needed inside the kernel.
    const float sample_factor = static_cast<float>(1 << kInputMantissaBits);
#else
    const float sample_factor = 1.0f;
#endif
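    // Illustrative arithmetic for the AVX2 path (numbers assumed): with
    // kInputMantissaBits == 20, an AR sample of 0.25 becomes
    // 0.25f * (1 << 20) = 262144.0f, so the sample * weight products land
    // directly on the fixed32<kInputBits> scale.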
    // AR samples 0 and 1 are packed into a pair because the QR weights are
    // stored with the weights for samples 0 and 1 interleaved.
    std::pair<float, float> ar_sample01;
    float ar_sample2_float = 0.0f;
    if (kInputsMode == ARInputsMode::k2ARInputs ||
        kInputsMode == ARInputsMode::k3ARInputs) {
      ar_sample01 = {static_cast<float>(*ar_sample0) * sample_factor,
                     static_cast<float>(*ar_sample1) * sample_factor};
      if (kInputsMode == ARInputsMode::k3ARInputs) {
        ar_sample2_float = static_cast<float>(*ar_sample2) * sample_factor;
      }
    }
#if defined __AVX2__
    CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
    GruGatesAVXFixed<kInputMantissaBits, kStateMantissaBits, kInputsMode,
                     kSplitGates>(
        start, end, state_size, gru_recurrent_ptr, input_ptr, &ar_sample01,
        ar_01_weights, num_replicas, replica_stride, &ar_sample2_float,
        ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
#else   // ARM.
    DCHECK_EQ(num_replicas, 1) << "ARM code should always have 1 replica";
    GoThroughGatesFixed<GRUStateType, InputType, kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_ptr, gru_recurrent_other_ptr,
        input_ptr, gru_state_ptr, ar_2_weights, state_size, &ar_sample01,
        &ar_sample2_float);
#endif  // __AVX2__ / ARM.
#else   // Generic case.
    CHECK_EQ(num_replicas, 1) << "Generic code should always have 1 replica";
    GoThroughGates<GRUStateType, InputType, SampleWeightType, SampleType,
                   kInputsMode, kSplitGates>(
        start, end, ar_01_weights, gru_recurrent_data, gru_recurrent_other_data,
        input_data, gru_state_data, ar_2_weights, state_size, ar_sample0,
        ar_sample1, ar_sample2);
#endif  // __ARM_NEON || defined __aarch64__ / __AVX2__
  }
};
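
// Illustrative instantiation of the fixed-point specialization (a sketch; the
// exponent-bit counts and buffer names are assumptions, not values prescribed
// by this header):
//
//   using Gates = GruGates<fixed16<2>, fixed32<11>, fixed16<2>>;
//   Gates gates;
//   gates.GruWithARInput<ARInputsMode::k3ARInputs, /*kSplitGates=*/true>(
//       /*start=*/0, /*end=*/state_size, state_size, recurrent, conditioning,
//       state, &sample0, &sample1, qr01_weights, /*num_replicas=*/1,
//       /*replica_stride=*/0, &sample2, qr2_weights, recurrent_other);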

}  // namespace csrblocksparse

#endif  // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_