Crop and Resize images using CV_CUDA

Maybe I’m using it incorrectly, but directly calling the C++ OpenCV API is much faster than using CV-CUDA.

#include <common/NvDecoder.h>
#include <common/TestUtils.h>
#include <cuda_runtime_api.h>
#include <cvcuda/OpCustomCrop.hpp>
#include <cvcuda/OpResize.hpp>
#include <getopt.h>
#include <nvcv/Image.hpp>
#include <nvcv/Tensor.hpp>
#include <opencv2/opencv.hpp>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <exception>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
using namespace chrono;
/**
 * @brief Crop and Resize sample app.
 *
 * The Crop and Resize sample is a simple pipeline which demonstrates usage of
 * CVCuda Tensor along with a few operators.
 *
 * Input Batch Tensor -> Crop -> Resize -> WriteImage
 */

 * @brief Utility to show usage of sample app
void showUsage()
    std::cout << "usage: ./nvcv_cropandresize_app -i <image file path or image directory -b <batch size>" << std::endl;

 * @brief Utility to parse the command line arguments
int ParseArgs(int argc, char *argv[], std::string &imagePath, uint32_t &batchSize)
    static struct option long_options[] = {
        { "help", no_argument, 0, 'h'},
        {"imagePath", required_argument, 0, 'i'},
        { "batch", required_argument, 0, 'b'},
        {0, 0, 0, 0}

    int long_index = 0;
    int opt = 0;
    while ((opt = getopt_long(argc, argv, "hi:b:", long_options, & amp;long_index)) != -1)
        switch (opt)
        case 'h':
            return -1;
        case 'i':
            imagePath = optarg;
        case 'b':
            batchSize = std::stoi(optarg);
        case ':':
            return -1;
    std::ifstream imageFile(imagePath);
    if (!imageFile.good())
        std::cerr << "Image path '" + imagePath + "' does not exist\\
        return -1;
    return 0;

int main(int argc, char *argv[])
    //Default parameters
    std::string imagePath = "test.jpg";
    uint32_t batchSize = 1;
    cv::Mat imgMat = cv::imread(imagePath);

    // Parse the command line paramaters to override the default parameters
    int retval = ParseArgs(argc, argv, imagePath, batchSize);
    if (retval != 0)
        return retval;

    // NvJpeg is used to decode the images to the color format required.
    // Since we need a contiguous buffer for batched input, a buffer is
    // preallocated based on the maximum image dimensions and batch size
    // for NvJpeg to write into.

    // Note: The maximum input image dimensions needs to be updated in case
    // of testing with different test images

    int maxImageWidth = 1920;
    int maxImageHeight = 1080;
    int maxChannels = 3;

    // tag: Create the cuda stream
    cudaStream_t stream;
    CHECK_CUDA_ERROR(cudaStreamCreate( & amp;stream));

    // tag: Allocate input tensor
    // Allocating memory for RGBI input image batch of uint8_t data type
    // without padding since NvDecode utility currently doesn't support
    // Padded buffers.

    nvcv::TensorDataStridedCuda::Buffer inBuf;
    inBuf.strides[3] = sizeof(uint8_t);
    inBuf.strides[2] = maxChannels * inBuf.strides[3];
    inBuf.strides[1] = maxImageWidth * inBuf.strides[2];
    inBuf.strides[0] = maxImageHeight * inBuf.strides[1];
    CHECK_CUDA_ERROR(cudaMallocAsync( & amp;inBuf.basePtr, batchSize * inBuf.strides[0], stream));

    // tag: Tensor Requirements
    // Calculate the requirements for the RGBI uint8_t Tensor which include
    // pitch bytes, alignment, shape and tensor layout
    nvcv::Tensor::Requirements inReqs
        = nvcv::Tensor::CalcRequirements(batchSize, {maxImageWidth, maxImageHeight}, nvcv::FMT_RGB8);

    // Create a tensor buffer to store the data pointer and pitch bytes for each plane
    nvcv::TensorDataStridedCuda inData(nvcv::TensorShape{inReqs.shape, inReqs.rank, inReqs.layout},
                                       nvcv::DataType{inReqs.dtype}, inBuf);

    // TensorWrapData allows for interoperation of external tensor representations with CVCUDA Tensor.
    nvcv::Tensor inTensor = nvcv::TensorWrapData(inData);

    // tag: Image Loading
    // NvJpeg is used to load the images to create a batched input device buffer.
    uint8_t *gpuInput = reinterpret_cast<uint8_t *>(inBuf.basePtr);
    CHECK_CUDA_ERROR(cudaMemcpyAsync(gpuInput,, inBuf.strides[0], cudaMemcpyHostToDevice));
    // The total images is set to the same value as batch size for testing
    uint32_t totalImages = batchSize;
    // Format in which the decoded output will be saved
    //nvjpegOutputFormat_t outputFormat = NVJPEG_OUTPUT_RGBI;

    //NvDecode(imagePath, batchSize, totalImages, outputFormat, gpuInput);

    // tag: The input buffer is now ready to be used by the operators

    // Set parameters for Crop and Resize
    // ROI dimensions to crop in the input image
    int cropX = 150;
    int cropY = 50;
    int cropWidth = 800;
    int cropHeight = 1000;

    // Set the resize dimensions
    int resizeWidth = 1600;
    int resizeHeight = 2000;

    // Initialize the CVCUDA ROI struct
    NVCVRectI crpRect = {cropX, cropY, cropWidth, cropHeight};

    cv::Rect Rect(cropX, cropY, cropWidth, cropHeight);

    auto t1=std::chrono::steady_clock::now();
    // Crop image
    cv::Mat cropImg = imgMat(Rect);

    //Resize image
    cv::resize(cropImg, cropImg, cv::Size(resizeWidth, resizeHeight));
    auto t2=std::chrono::steady_clock::now();
    double dr_ms=std::chrono::duration<double,std::milli>(t2-t1).count();
    std::cout << "opencv costs: " << dr_ms << "ms" << std::endl;

    // tag: Allocate Tensors for Crop and Resize
    // Create a CVCUDA Tensor based on the crop window size.
    nvcv::Tensor cropTensor(batchSize, {cropWidth, cropHeight}, nvcv::FMT_RGB8);
    // Create a CVCUDA Tensor based on resize dimensions
    nvcv::Tensor resizedTensor(batchSize, {resizeWidth, resizeHeight}, nvcv::FMT_RGB8);

    // tag: Initialize operators for Crop and Resize
    cvcuda::CustomCrop cropOp;
    cvcuda::Resize resizeOp;

    cudaEvent_t start, stop;
    cudaEventCreate( & amp;start);
    cudaEventCreate( & amp;stop);
    // tag: Executes the CustomCrop operation on the given cuda stream
    cropOp(stream, inTensor, cropTensor, crpRect);

    // Resize operator can now be enqueued into the same stream
    resizeOp(stream, cropTensor, resizedTensor, NVCV_INTERP_LINEAR);

    // tag: Profile section

    float operators = 0;
    cudaEventElapsedTime( & amp;operatorms, start, stop);
    std::cout << "Time for Crop and Resize : " << operatorms << " ms" << std::endl;

    // tag: Copy the buffer to CPU and write resized image into .bmp file
    WriteRGBITensor(resizedTensor, stream);

    // tag: Clean up

    // tag: End of Sample


opencv costs: 3.16336ms
Time for Crop and Resize: 200.148 ms
Writing to ./cvcudatest_0.jpg 4800 1600 2000