Spaces:

ntt123
/

WaveGRU-Text-To-Speech

Running

File size: 11,500 Bytes

d1a84ee

/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
#define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_

#include <algorithm>
#include <type_traits>

#include "sparse_matmul/numerics/fixed_types.h"
#include "sparse_matmul/numerics/float16_types.h"
#include "sparse_matmul/numerics/type_utils.h"

// Separate out the assembly kernels for readability. Eventually this will
// become an ifdef switch on the architecture type.
#if defined __aarch64__
#include "sparse_matmul/compute/kernels_arm.h"
#elif defined __AVX__
#include "sparse_matmul/compute/kernels_avx.h"
#else   // defined __AVX__
// Fallback selection traits: with neither the ARM nor the AVX kernel header
// included, nothing else defines these, so every generic kernel below is
// enabled unconditionally, whatever the template arguments are. The template
// parameters are deliberately left unnamed because they are never inspected.

// Enables the generic SpMV_4x4 kernel for all (weight, rhs, out) type triples.
template <typename, typename, typename>
struct ShouldEnableGenericSpMV_4x4 : std::true_type {};

// Enables the generic SpMM5_4x4 kernel for all (weight, rhs, out) type triples.
template <typename, typename, typename>
struct ShouldEnableGenericSpMM5_4x4 : std::true_type {};

// Enables the generic SpMV_1x1 kernel for all (weight, rhs, out) type triples.
template <typename, typename, typename>
struct ShouldEnableGenericSpMV_1x1 : std::true_type {};

// Enables the generic SpMM5_1x1 kernel for all (weight, rhs, out) type triples.
template <typename, typename, typename>
struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};

// Enables the generic SumVectors kernel for every element type.
template <typename>
struct ShouldEnableGenericAdd : std::true_type {};
#endif  // defined __aarch64__

namespace csrblocksparse {
namespace detail {

// The computational routines do NO error checking for speed.  It is assumed
// that this has been handled by CSRBlockSparseMatrix.

// Generic (portable C++) kernel for y = A * x + b, where A is a sparse matrix
// with a 4x4 blocked pattern, x is a vector and b is a vector.
//
// Inputs, as consumed by the loops below:
//   weights_ptr       The 16 weights of each stored block are contiguous and
//                     row-major within the block; blocks are read in the
//                     order they are visited (block-row after block-row).
//   col_deltas_bytes  One signed entry per stored block: the distance, in
//                     BYTES, from the current rhs position to the first rhs
//                     element of that block. It is converted to an element
//                     count by dividing by sizeof(RhsType), so the value can
//                     also be used directly as a pointer offset by the
//                     assembly kernels.
//   nnz_per_row       Number of stored blocks in each block-row.
//   rhs_ptr           Start of the rhs vector x.
//   bias_ptr          Four bias values per block-row (see NOTE).
//   out_ptr           Receives four outputs per block-row.
//   assigned_rows     Number of block-rows to process.
//   rows, cols        Unused here; only the SpMM variants need them.
//   relu              If non-zero, each result is clamped to be >= 0 before
//                     being converted to OutType.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function.  This is automatically taken care of in SparseLinearLayer.
// In the assembly versions the bias is reconstructed through horizontal
// additions, which gives a small speedup by reducing latencies at the end of
// the loop; this generic version simply multiplies it by 4 again.
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMV_4x4<WeightType, RhsType, OutType>::value>::type
SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
         const int32_t* nnz_per_row, const RhsType* rhs_ptr,
         const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
         OutType* out_ptr, int64_t assigned_rows,
         int64_t rows /* only used in SpMM variants */,
         int64_t cols /* only used in SpMM variants */, int relu) {
  // Kept as a signed int on purpose: the deltas are signed, and dividing by
  // the unsigned result of sizeof() would convert a negative delta to a huge
  // unsigned value before the division.
  constexpr int kRhsElementBytes = static_cast<int>(sizeof(RhsType));

  for (int64_t reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
    float accumulators[4];
    // Undo the division by 4 that is applied to the bias for the assembly
    // version.
    for (int i = 0; i < 4; ++i)
      accumulators[i] = 4.f * static_cast<float>(*bias_ptr++);

    const int reduced_col_count = *nnz_per_row++;
    for (int c = 0; c < reduced_col_count; ++c) {
      // Byte delta -> element delta (signed division, see above).
      const int col_delta = *col_deltas_bytes++ / kRhsElementBytes;
      rhs_ptr += col_delta;

      // Multiply this 4x4 block: row i of the block against rhs[0..3].
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          accumulators[i] += static_cast<float>(*weights_ptr++) *
                             static_cast<float>(rhs_ptr[j]);
        }
      }
    }

    for (int i = 0; i < 4; ++i)
      *out_ptr++ = static_cast<OutType>(relu ? std::max(accumulators[i], 0.f)
                                             : accumulators[i]);
  }
}

// Generic (portable C++) kernel for y = A * x + b, where A is a sparse matrix
// with a 4x4 blocked pattern, x is a fat vector with 5 columns and b is a
// vector that is broadcast across those 5 columns.
//
// Inputs, as consumed by the loops below:
//   weights_ptr       The 16 weights of each stored block are contiguous and
//                     row-major within the block; blocks are read in the
//                     order they are visited (block-row after block-row).
//   col_deltas_bytes  One signed entry per stored block: the distance, in
//                     BYTES, from the current rhs position to the first rhs
//                     element of that block. It is converted to an element
//                     count by dividing by sizeof(RhsType), and applied to
//                     all 5 rhs columns.
//   nnz_per_row       Number of stored blocks in each block-row.
//   rhs_ptr           Start of rhs column 0; column k starts at
//                     rhs_ptr + k * cols.
//   bias_ptr          Four bias values per block-row (see NOTE).
//   out_ptr           Start of output column 0; column k starts at
//                     out_ptr + k * rows. Four outputs per block-row are
//                     written to each column.
//   assigned_rows     Number of block-rows to process.
//   rows, cols        Element strides between consecutive output / rhs
//                     columns respectively.
//   relu              If non-zero, each result is clamped to be >= 0 before
//                     being converted to OutType.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function.  This is automatically taken care of in SparseLinearLayer.
// In the assembly versions the bias is reconstructed through horizontal
// additions, which gives a small speedup by reducing latencies at the end of
// the loop; this generic version simply multiplies it by 4 again.
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMM5_4x4<WeightType, RhsType, OutType>::value>::type
SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
          const int32_t* nnz_per_row, const RhsType* rhs_ptr,
          const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
          OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
          int relu) {
  // Kept as a signed int on purpose: the deltas are signed, and dividing by
  // the unsigned result of sizeof() would convert a negative delta to a huge
  // unsigned value before the division.
  constexpr int kRhsElementBytes = static_cast<int>(sizeof(RhsType));

  // One cursor per rhs column and per output column.
  const RhsType* rhs_ptrs[5];
  for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;

  OutType* out_ptrs[5];
  for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;

  for (int64_t reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
    float accumulators[4][5];
    // Undo the division by 4 that is applied to the bias for the assembly
    // version, broadcasting each bias value to all 5 columns.
    for (int i = 0; i < 4; ++i) {
      for (int k = 0; k < 5; ++k) {
        accumulators[i][k] = 4.f * static_cast<float>(*bias_ptr);
      }
      ++bias_ptr;
    }

    const int reduced_col_count = *nnz_per_row++;
    for (int c = 0; c < reduced_col_count; ++c) {
      // Byte delta -> element delta (signed division, see above).
      const int col_delta = *col_deltas_bytes++ / kRhsElementBytes;
      for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;

      // Multiply this 4x4 block against each of the 5 rhs columns; every
      // weight is read once and reused for all columns.
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          for (int k = 0; k < 5; ++k) {
            accumulators[i][k] += static_cast<float>(*weights_ptr) *
                                  static_cast<float>(rhs_ptrs[k][j]);
          }
          weights_ptr++;
        }
      }
    }

    // Write the 4 results of this block-row to each output column.
    for (int k = 0; k < 5; ++k) {
      for (int i = 0; i < 4; ++i) {
        out_ptrs[k][0] = static_cast<OutType>(
            relu ? std::max(accumulators[i][k], 0.f) : accumulators[i][k]);
        out_ptrs[k]++;
      }
    }
  }
}

// Generic (portable C++) kernel for y = A * x + b, where A is a sparse matrix
// with a 1x1 blocked pattern (i.e. unstructured), x is a vector and b is a
// vector.
//
// Weights are stored for this routine in standard CSR format.  Each row must
// have a multiple of 8 columns (a requirement of the storage format; the loop
// below does not itself rely on it).
//
// Inputs, as consumed by the loops below:
//   weights_ptr       One weight per stored element, in visiting order.
//   col_deltas_bytes  One signed entry per stored element: the distance, in
//                     BYTES, from the current rhs position to the rhs element
//                     matching that weight. It is converted to an element
//                     count by dividing by sizeof(RhsType).
//   nnz_per_row       Number of stored elements in each row.
//   rhs_ptr           Start of the rhs vector x.
//   bias_ptr          One bias value per row (see NOTE).
//   out_ptr           Receives one output per row.
//   assigned_rows     Number of rows to process.
//   rows, cols        Unused here; only the SpMM variants need them.
//   relu              If non-zero, each result is clamped to be >= 0 before
//                     being converted to OutType.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function.  This is automatically taken care of in SparseLinearLayer.
// In the assembly versions the bias is reconstructed through horizontal
// additions, which gives a small speedup by reducing latencies at the end of
// the loop; this generic version simply multiplies it by 4 again.
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMV_1x1<WeightType, RhsType, OutType>::value>::type
SpMV_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
         const int32_t* nnz_per_row, const RhsType* rhs_ptr,
         const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
         OutType* out_ptr, int64_t assigned_rows,
         int64_t rows /* only used in SpMM variants */,
         int64_t cols /* only used in SpMM variants */, int relu) {
  // Kept as a signed int on purpose: the deltas are signed, and dividing by
  // the unsigned result of sizeof() would convert a negative delta to a huge
  // unsigned value before the division.
  constexpr int kRhsElementBytes = static_cast<int>(sizeof(RhsType));

  for (int64_t row = 0; row < assigned_rows; ++row) {
    // Undo the division by 4 that is applied to the bias for the assembly
    // version.
    float accumulator = 4.f * static_cast<float>(*bias_ptr++);

    const int col_count = *nnz_per_row++;
    for (int c = 0; c < col_count; ++c) {
      // Byte delta -> element delta (signed division, see above).
      const int col_delta = *col_deltas_bytes++ / kRhsElementBytes;
      rhs_ptr += col_delta;

      accumulator +=
          static_cast<float>(*weights_ptr++) * static_cast<float>(*rhs_ptr);
    }

    *out_ptr++ =
        static_cast<OutType>(relu ? std::max(accumulator, 0.f) : accumulator);
  }
}

// Generic (portable C++) kernel for y = A * x + b, where A is a sparse matrix
// with a 1x1 blocked pattern (i.e. unstructured), x is a fat vector with 5
// columns and b is a vector that is broadcast across those 5 columns.
//
// Weights are stored for this routine in standard CSR format.  Each row must
// have a multiple of 8 columns (a requirement of the storage format; the loop
// below does not itself rely on it).
//
// Inputs, as consumed by the loops below:
//   weights_ptr       One weight per stored element, in visiting order.
//   col_deltas_bytes  One signed entry per stored element: the distance, in
//                     BYTES, from the current rhs position to the rhs element
//                     matching that weight. It is converted to an element
//                     count by dividing by sizeof(RhsType), and applied to
//                     all 5 rhs columns.
//   nnz_per_row       Number of stored elements in each row.
//   rhs_ptr           Start of rhs column 0; column k starts at
//                     rhs_ptr + k * cols.
//   bias_ptr          One bias value per row (see NOTE).
//   out_ptr           Start of output column 0; column k starts at
//                     out_ptr + k * rows. One output per row is written to
//                     each column.
//   assigned_rows     Number of rows to process.
//   rows, cols        Element strides between consecutive output / rhs
//                     columns respectively.
//   relu              If non-zero, each result is clamped to be >= 0 before
//                     being converted to OutType.
//
// NOTE: The bias is expected to have been multiplied by .25f prior to calling
// this function.  This is automatically taken care of in SparseLinearLayer.
// In the assembly versions the bias is reconstructed through horizontal
// additions, which gives a small speedup by reducing latencies at the end of
// the loop; this generic version simply multiplies it by 4 again.
template <typename WeightType, typename RhsType, typename OutType>
typename std::enable_if<
    ShouldEnableGenericSpMM5_1x1<WeightType, RhsType, OutType>::value>::type
SpMM5_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
          const int32_t* nnz_per_row, const RhsType* rhs_ptr,
          const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
          OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
          int relu) {
  // Kept as a signed int on purpose: the deltas are signed, and dividing by
  // the unsigned result of sizeof() would convert a negative delta to a huge
  // unsigned value before the division.
  constexpr int kRhsElementBytes = static_cast<int>(sizeof(RhsType));

  // One cursor per rhs column and per output column.
  const RhsType* rhs_ptrs[5];
  for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;

  OutType* out_ptrs[5];
  for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;

  for (int64_t row = 0; row < assigned_rows; ++row) {
    // Undo the division by 4 that is applied to the bias for the assembly
    // version, broadcasting the bias value to all 5 columns.
    float accumulator[5];
    for (int i = 0; i < 5; ++i)
      accumulator[i] = 4.f * static_cast<float>(*bias_ptr);

    ++bias_ptr;

    const int col_count = *nnz_per_row++;
    for (int c = 0; c < col_count; ++c) {
      // Byte delta -> element delta (signed division, see above).
      const int col_delta = *col_deltas_bytes++ / kRhsElementBytes;
      // The same weight is applied to the matching element of every column.
      for (int i = 0; i < 5; ++i) {
        rhs_ptrs[i] += col_delta;
        accumulator[i] += static_cast<float>(*weights_ptr) *
                          static_cast<float>(rhs_ptrs[i][0]);
      }
      weights_ptr++;
    }

    // Write this row's result to each output column.
    for (int i = 0; i < 5; ++i) {
      out_ptrs[i][0] = static_cast<OutType>(relu ? std::max(accumulator[i], 0.f)
                                                 : accumulator[i]);
      out_ptrs[i]++;
    }
  }
}

// Generic element-wise addition: for every index i in [start, end), stores
// add1[i] + add2[i] into result[i]. Both operands are converted to float, the
// addition is done in float, and the sum is converted back to Type. A warning
// is logged on first use to flag that the generic kernel is in use.
template <typename Type>
typename std::enable_if<ShouldEnableGenericAdd<Type>::value>::type SumVectors(
    int start, int end, const Type* add1, const Type* add2, Type* result) {
  LOG_FIRST_N(WARNING, 1) << "SumVectors: using generic kernel!";
  for (int index = start; index < end; ++index) {
    // Read both operands before the store, so result may alias an input.
    const float lhs = static_cast<float>(add1[index]);
    const float rhs = static_cast<float>(add2[index]);
    result[index] = static_cast<Type>(lhs + rhs);
  }
}

}  // namespace detail
}  // namespace csrblocksparse

#undef LABEL_COL_LOOP
#undef LABEL_ROW_LOOP
#undef LABEL_SKIP_COL_LOOP
#undef LABEL_TOP_LOOP

#endif  // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_