/******************************************************************************
 * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 * Modifications Copyright (c) 2017-2024, Advanced Micro Devices, Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#ifndef HIPCUB_CUB_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
#define HIPCUB_CUB_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_

#include "../../../config.hpp"
#include "../../../util_deprecated.hpp"

#include <cub/device/device_segmented_reduce.cuh>

BEGIN_HIPCUB_NAMESPACE

/// \brief Segmented-reduction entry points of the CUB backend.
///
/// Each non-deprecated member hands its arguments, unchanged and in declaration
/// order, to the same-named static function of \p ::cub::DeviceSegmentedReduce
/// and passes the value that call returns through \p hipCUDAErrorTohipError.
///
/// Each member also has an overload carrying a trailing \p debug_synchronous
/// parameter. Those overloads are tagged with
/// \p HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS, invoke
/// \p HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS() and then delegate to the
/// overload without the flag; the flag itself is never forwarded.
struct DeviceSegmentedReduce
{
    /// \brief Forwards to \p ::cub::DeviceSegmentedReduce::Reduce.
    ///
    /// \return the value returned by the CUB call, converted with
    /// \p hipCUDAErrorTohipError.
    template<typename InputIteratorT,
             typename OutputIteratorT,
             typename OffsetIteratorT,
             typename ReductionOp,
             typename T>
    HIPCUB_RUNTIME_FUNCTION static hipError_t Reduce(void* d_temp_storage,
                                                     size_t& temp_storage_bytes,
                                                     InputIteratorT d_in,
                                                     OutputIteratorT d_out,
                                                     int num_segments,
                                                     OffsetIteratorT d_begin_offsets,
                                                     OffsetIteratorT d_end_offsets,
                                                     ReductionOp reduction_op,
                                                     T initial_value,
                                                     hipStream_t stream = 0)
    {
        // Run the CUB implementation first, then translate its result.
        const auto cub_status = ::cub::DeviceSegmentedReduce::Reduce(d_temp_storage,
                                                                     temp_storage_bytes,
                                                                     d_in,
                                                                     d_out,
                                                                     num_segments,
                                                                     d_begin_offsets,
                                                                     d_end_offsets,
                                                                     reduction_op,
                                                                     initial_value,
                                                                     stream);
        return hipCUDAErrorTohipError(cub_status);
    }

    /// \brief Deprecated \p Reduce overload that also accepts \p debug_synchronous.
    ///
    /// \p debug_synchronous is not passed on to the delegated call.
    /// NOTE(review): the logging macro takes no arguments, so it presumably reads
    /// \p debug_synchronous by name - confirm in util_deprecated.hpp before
    /// renaming the parameter.
    ///
    /// \return whatever the ten-argument \p Reduce overload returns.
    template<typename InputIteratorT,
             typename OutputIteratorT,
             typename OffsetIteratorT,
             typename ReductionOp,
             typename T>
    HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t
        Reduce(void* d_temp_storage,
               size_t& temp_storage_bytes,
               InputIteratorT d_in,
               OutputIteratorT d_out,
               int num_segments,
               OffsetIteratorT d_begin_offsets,
               OffsetIteratorT d_end_offsets,
               ReductionOp reduction_op,
               T initial_value,
               hipStream_t stream,
               bool debug_synchronous)
    {
        HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS();
        // Ten arguments select the overload without the flag.
        return DeviceSegmentedReduce::Reduce(d_temp_storage,
                                             temp_storage_bytes,
                                             d_in,
                                             d_out,
                                             num_segments,
                                             d_begin_offsets,
                                             d_end_offsets,
                                             reduction_op,
                                             initial_value,
                                             stream);
    }

    /// \brief Forwards to \p ::cub::DeviceSegmentedReduce::Sum.
    ///
    /// \return the value returned by the CUB call, converted with
    /// \p hipCUDAErrorTohipError.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage,
                                                  size_t& temp_storage_bytes,
                                                  InputIteratorT d_in,
                                                  OutputIteratorT d_out,
                                                  int num_segments,
                                                  OffsetIteratorT d_begin_offsets,
                                                  OffsetIteratorT d_end_offsets,
                                                  hipStream_t stream = 0)
    {
        const auto cub_status = ::cub::DeviceSegmentedReduce::Sum(d_temp_storage,
                                                                  temp_storage_bytes,
                                                                  d_in,
                                                                  d_out,
                                                                  num_segments,
                                                                  d_begin_offsets,
                                                                  d_end_offsets,
                                                                  stream);
        return hipCUDAErrorTohipError(cub_status);
    }

    /// \brief Deprecated \p Sum overload that also accepts \p debug_synchronous.
    ///
    /// \p debug_synchronous is not passed on to the delegated call.
    ///
    /// \return whatever the eight-argument \p Sum overload returns.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t
        Sum(void* d_temp_storage,
            size_t& temp_storage_bytes,
            InputIteratorT d_in,
            OutputIteratorT d_out,
            int num_segments,
            OffsetIteratorT d_begin_offsets,
            OffsetIteratorT d_end_offsets,
            hipStream_t stream,
            bool debug_synchronous)
    {
        HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS();
        return DeviceSegmentedReduce::Sum(d_temp_storage,
                                          temp_storage_bytes,
                                          d_in,
                                          d_out,
                                          num_segments,
                                          d_begin_offsets,
                                          d_end_offsets,
                                          stream);
    }

    /// \brief Forwards to \p ::cub::DeviceSegmentedReduce::Min.
    ///
    /// \return the value returned by the CUB call, converted with
    /// \p hipCUDAErrorTohipError.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage,
                                                  size_t& temp_storage_bytes,
                                                  InputIteratorT d_in,
                                                  OutputIteratorT d_out,
                                                  int num_segments,
                                                  OffsetIteratorT d_begin_offsets,
                                                  OffsetIteratorT d_end_offsets,
                                                  hipStream_t stream = 0)
    {
        const auto cub_status = ::cub::DeviceSegmentedReduce::Min(d_temp_storage,
                                                                  temp_storage_bytes,
                                                                  d_in,
                                                                  d_out,
                                                                  num_segments,
                                                                  d_begin_offsets,
                                                                  d_end_offsets,
                                                                  stream);
        return hipCUDAErrorTohipError(cub_status);
    }

    /// \brief Deprecated \p Min overload that also accepts \p debug_synchronous.
    ///
    /// \p debug_synchronous is not passed on to the delegated call.
    ///
    /// \return whatever the eight-argument \p Min overload returns.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t
        Min(void* d_temp_storage,
            size_t& temp_storage_bytes,
            InputIteratorT d_in,
            OutputIteratorT d_out,
            int num_segments,
            OffsetIteratorT d_begin_offsets,
            OffsetIteratorT d_end_offsets,
            hipStream_t stream,
            bool debug_synchronous)
    {
        HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS();
        return DeviceSegmentedReduce::Min(d_temp_storage,
                                          temp_storage_bytes,
                                          d_in,
                                          d_out,
                                          num_segments,
                                          d_begin_offsets,
                                          d_end_offsets,
                                          stream);
    }

    /// \brief Forwards to \p ::cub::DeviceSegmentedReduce::ArgMin.
    ///
    /// \return the value returned by the CUB call, converted with
    /// \p hipCUDAErrorTohipError.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage,
                                                     size_t& temp_storage_bytes,
                                                     InputIteratorT d_in,
                                                     OutputIteratorT d_out,
                                                     int num_segments,
                                                     OffsetIteratorT d_begin_offsets,
                                                     OffsetIteratorT d_end_offsets,
                                                     hipStream_t stream = 0)
    {
        const auto cub_status = ::cub::DeviceSegmentedReduce::ArgMin(d_temp_storage,
                                                                     temp_storage_bytes,
                                                                     d_in,
                                                                     d_out,
                                                                     num_segments,
                                                                     d_begin_offsets,
                                                                     d_end_offsets,
                                                                     stream);
        return hipCUDAErrorTohipError(cub_status);
    }

    /// \brief Deprecated \p ArgMin overload that also accepts \p debug_synchronous.
    ///
    /// \p debug_synchronous is not passed on to the delegated call.
    ///
    /// \return whatever the eight-argument \p ArgMin overload returns.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t
        ArgMin(void* d_temp_storage,
               size_t& temp_storage_bytes,
               InputIteratorT d_in,
               OutputIteratorT d_out,
               int num_segments,
               OffsetIteratorT d_begin_offsets,
               OffsetIteratorT d_end_offsets,
               hipStream_t stream,
               bool debug_synchronous)
    {
        HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS();
        return DeviceSegmentedReduce::ArgMin(d_temp_storage,
                                             temp_storage_bytes,
                                             d_in,
                                             d_out,
                                             num_segments,
                                             d_begin_offsets,
                                             d_end_offsets,
                                             stream);
    }

    /// \brief Forwards to \p ::cub::DeviceSegmentedReduce::Max.
    ///
    /// \return the value returned by the CUB call, converted with
    /// \p hipCUDAErrorTohipError.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage,
                                                  size_t& temp_storage_bytes,
                                                  InputIteratorT d_in,
                                                  OutputIteratorT d_out,
                                                  int num_segments,
                                                  OffsetIteratorT d_begin_offsets,
                                                  OffsetIteratorT d_end_offsets,
                                                  hipStream_t stream = 0)
    {
        const auto cub_status = ::cub::DeviceSegmentedReduce::Max(d_temp_storage,
                                                                  temp_storage_bytes,
                                                                  d_in,
                                                                  d_out,
                                                                  num_segments,
                                                                  d_begin_offsets,
                                                                  d_end_offsets,
                                                                  stream);
        return hipCUDAErrorTohipError(cub_status);
    }

    /// \brief Deprecated \p Max overload that also accepts \p debug_synchronous.
    ///
    /// \p debug_synchronous is not passed on to the delegated call.
    ///
    /// \return whatever the eight-argument \p Max overload returns.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t
        Max(void* d_temp_storage,
            size_t& temp_storage_bytes,
            InputIteratorT d_in,
            OutputIteratorT d_out,
            int num_segments,
            OffsetIteratorT d_begin_offsets,
            OffsetIteratorT d_end_offsets,
            hipStream_t stream,
            bool debug_synchronous)
    {
        HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS();
        return DeviceSegmentedReduce::Max(d_temp_storage,
                                          temp_storage_bytes,
                                          d_in,
                                          d_out,
                                          num_segments,
                                          d_begin_offsets,
                                          d_end_offsets,
                                          stream);
    }

    /// \brief Forwards to \p ::cub::DeviceSegmentedReduce::ArgMax.
    ///
    /// \return the value returned by the CUB call, converted with
    /// \p hipCUDAErrorTohipError.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage,
                                                     size_t& temp_storage_bytes,
                                                     InputIteratorT d_in,
                                                     OutputIteratorT d_out,
                                                     int num_segments,
                                                     OffsetIteratorT d_begin_offsets,
                                                     OffsetIteratorT d_end_offsets,
                                                     hipStream_t stream = 0)
    {
        const auto cub_status = ::cub::DeviceSegmentedReduce::ArgMax(d_temp_storage,
                                                                     temp_storage_bytes,
                                                                     d_in,
                                                                     d_out,
                                                                     num_segments,
                                                                     d_begin_offsets,
                                                                     d_end_offsets,
                                                                     stream);
        return hipCUDAErrorTohipError(cub_status);
    }

    /// \brief Deprecated \p ArgMax overload that also accepts \p debug_synchronous.
    ///
    /// \p debug_synchronous is not passed on to the delegated call.
    ///
    /// \return whatever the eight-argument \p ArgMax overload returns.
    template<typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    HIPCUB_DETAIL_DEPRECATED_DEBUG_SYNCHRONOUS HIPCUB_RUNTIME_FUNCTION static hipError_t
        ArgMax(void* d_temp_storage,
               size_t& temp_storage_bytes,
               InputIteratorT d_in,
               OutputIteratorT d_out,
               int num_segments,
               OffsetIteratorT d_begin_offsets,
               OffsetIteratorT d_end_offsets,
               hipStream_t stream,
               bool debug_synchronous)
    {
        HIPCUB_DETAIL_RUNTIME_LOG_DEBUG_SYNCHRONOUS();
        return DeviceSegmentedReduce::ArgMax(d_temp_storage,
                                             temp_storage_bytes,
                                             d_in,
                                             d_out,
                                             num_segments,
                                             d_begin_offsets,
                                             d_end_offsets,
                                             stream);
    }
};

END_HIPCUB_NAMESPACE

#endif // HIPCUB_CUB_DEVICE_DEVICE_SEGMENTED_REDUCE_HPP_
