/*******************************************************************************
 * Copyright 2020 Intel Corporation.
 *
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them is governed by
 * the express license under which they were provided to you ('License'). Unless the License provides otherwise,
 * you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related
 * documents without Intel's prior written permission.
 * This software and the related documents are provided as is, with no express or implied warranties, other than
 * those that are expressly stated in the License.
 *******************************************************************************/
#include "pifilils_t.h"

// singlethreaded
/*
 * Horizontal difference with wrap-around (single-threaded).
 *
 * Uses ippiSub_32f_C1R, which subtracts its first source from its second:
 *   dst[x]         = src[x + 1] - src[x]      for x in [0, width - 2]
 *   dst[width - 1] = src[0]     - src[width - 1]
 *
 * srcStep / dstStep are the row strides handed to IPP (bytes).
 */
static void ils_s_DiffHR_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize)
{
    const int lastCol = roiSize.width - 1;
    IppiSize bodyRoi = roiSize;
    IppiSize edgeRoi = roiSize;

    bodyRoi.width = lastCol; /* every column except the last one */
    edgeRoi.width = 1;       /* the last column only */

    ippiSub_32f_C1R(pSrc, srcStep, pSrc + 1, srcStep, pDst, dstStep, bodyRoi);
    /* The last column wraps around to column 0. */
    ippiSub_32f_C1R(pSrc + lastCol, srcStep, pSrc, srcStep, pDst + lastCol, dstStep, edgeRoi);
}
/*
 * Row-by-row evaluation of the filter term for one plane (single-threaded).
 *
 *   filterType == ippiFilterILS_Norm : dst = src * (c - p * (src^2 + eps)^gamma)
 *   any other value (Welsch)         : dst = src * (2 - 2 * exp(-src^2 / (2 * gamma^2)))
 *
 * In the Welsch branch the incoming c and eps are overwritten and p is not used.
 * srcStep / dstStep are row strides in bytes; roiSize gives the row length and row count.
 */
static void ils_s_MU_C1(Ipp32f *pSrc, int srcStep, Ipp32f c, Ipp32f p, Ipp32f gamma, Ipp32f eps, IppiFilterILSType filterType, Ipp32f *pDst,
                        int dstStep, IppiSize roiSize)
{
    const int width = roiSize.width;
    int row;

    if (filterType != ippiFilterILS_Norm) {
        /* Welsch: the exponent scale and the additive constant are derived locally. */
        eps = -1.0 / (2 * gamma * gamma);
        c = 2.0;
        for (row = 0; row < roiSize.height; row++) {
            Ipp32f *in = (Ipp32f *)((Ipp8u *)pSrc + row * srcStep);
            Ipp32f *out = (Ipp32f *)((Ipp8u *)pDst + row * dstStep);

            ippsSqr_32f(in, out, width);      /* s^2                         */
            ippsMulC_32f_I(eps, out, width);  /* eps * s^2                   */
            ippsExp_32f_I(out, width);        /* exp(eps * s^2)              */
            ippsMulC_32f_I(-2.0, out, width); /* -2 * exp(...)               */
            ippsAddC_32f_I(c, out, width);    /* c - 2 * exp(...)            */
            ippsMul_32f_I(in, out, width);    /* s * (c - 2 * exp(...))      */
        }
        return;
    }

    /* Norm */
    for (row = 0; row < roiSize.height; row++) {
        Ipp32f *in = (Ipp32f *)((Ipp8u *)pSrc + row * srcStep);
        Ipp32f *out = (Ipp32f *)((Ipp8u *)pDst + row * dstStep);

        ippsSqr_32f(in, out, width);               /* s^2                          */
        ippsAddC_32f_I(eps, out, width);           /* s^2 + eps                    */
        ippsPowx_32f_A21(out, gamma, out, width);  /* (s^2 + eps)^gamma            */
        ippsMulC_32f_I(-p, out, width);            /* -p * (...)^gamma             */
        ippsAddC_32f_I(c, out, width);             /* c - p * (...)^gamma          */
        ippsMul_32f_I(in, out, width);             /* s * (c - p * (...)^gamma)    */
    }
}
/*
 * Horizontal difference towards the left neighbour, with wrap-around (single-threaded).
 *
 * Uses ippiSub_32f_C1R, which subtracts its first source from its second:
 *   dst[x] = src[x - 1]     - src[x]   for x in [1, width - 1]
 *   dst[0] = src[width - 1] - src[0]
 */
static void ils_s_DiffHL_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize)
{
    const int lastCol = roiSize.width - 1;
    IppiSize bodyRoi = roiSize;
    IppiSize edgeRoi = roiSize;

    bodyRoi.width = lastCol; /* columns 1 .. width - 1 */
    edgeRoi.width = 1;       /* column 0 only */

    ippiSub_32f_C1R(pSrc + 1, srcStep, pSrc, srcStep, pDst + 1, dstStep, bodyRoi);
    /* Column 0 wraps around to the last column. */
    ippiSub_32f_C1R(pSrc, srcStep, pSrc + lastCol, srcStep, pDst, dstStep, edgeRoi);
}
/*
 * Vertical difference with wrap-around (single-threaded).
 *
 * Uses ippiSub_32f_C1R, which subtracts its first source from its second:
 *   dst[r]          = src[r + 1] - src[r]   for rows r in [0, height - 2]
 *   dst[height - 1] = src[0]     - src[height - 1]
 */
static void ils_s_DiffVU_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize)
{
    const int lastRow = roiSize.height - 1;
    Ipp32f *srcNextRow = (Ipp32f *)((Ipp8u *)pSrc + srcStep);
    Ipp32f *srcLastRow = (Ipp32f *)((Ipp8u *)pSrc + srcStep * lastRow);
    Ipp32f *dstLastRow = (Ipp32f *)((Ipp8u *)pDst + dstStep * lastRow);
    IppiSize bodyRoi = roiSize;
    IppiSize edgeRoi = roiSize;

    bodyRoi.height = lastRow; /* every row except the last one */
    edgeRoi.height = 1;       /* the last row only */

    ippiSub_32f_C1R(pSrc, srcStep, srcNextRow, srcStep, pDst, dstStep, bodyRoi);
    /* The last row wraps around to row 0. */
    ippiSub_32f_C1R(srcLastRow, srcStep, pSrc, srcStep, dstLastRow, dstStep, edgeRoi);
}
/*
 * Vertical difference towards the row above, with wrap-around (single-threaded).
 *
 * Uses ippiSub_32f_C1R, which subtracts its first source from its second:
 *   dst[0] = src[height - 1] - src[0]
 *   dst[r] = src[r - 1]      - src[r]   for rows r in [1, height - 1]
 */
static void ils_s_DiffVD_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize)
{
    const int lastRow = roiSize.height - 1;
    Ipp32f *srcLastRow = (Ipp32f *)((Ipp8u *)pSrc + srcStep * lastRow);
    Ipp32f *srcSecondRow = (Ipp32f *)((Ipp8u *)pSrc + srcStep);
    Ipp32f *dstSecondRow = (Ipp32f *)((Ipp8u *)pDst + dstStep);
    IppiSize edgeRoi = roiSize;
    IppiSize bodyRoi = roiSize;

    edgeRoi.height = 1;       /* row 0 only */
    bodyRoi.height = lastRow; /* rows 1 .. height - 1 */

    /* Row 0 wraps around to the last row. */
    ippiSub_32f_C1R(pSrc, srcStep, srcLastRow, srcStep, pDst, dstStep, edgeRoi);
    ippiSub_32f_C1R(srcSecondRow, srcStep, pSrc, srcStep, dstSecondRow, dstStep, bodyRoi);
}
/*
 * Element-wise product of two packed-spectrum images (single-threaded).
 * NOTE(review): the layout handled here looks like the IPP 2-D "Pack" format - confirm.
 *
 * What the code does:
 *   - Row 0, and the last row when the height is even: column 0 (and column
 *     width - 1 when the width is even) are multiplied as plain real values;
 *     the n values in between are multiplied as interleaved (re, im) complex
 *     numbers.
 *   - The remaining rows are consumed two at a time: in column 0 (and column
 *     width - 1 for even widths) the upper row supplies the real part and the
 *     lower row the imaginary part of one complex value; the interior of each
 *     row is multiplied as interleaved complex numbers.
 *
 * s1Step / s2Step / dStep are row strides in bytes.
 */
static void ils_s_MulPack_32f_C1R(const Ipp32f *pS1, int s1Step, const Ipp32f *pS2, int s2Step, Ipp32f *pD, int dStep, IppiSize roiSize)
{
    const int w = roiSize.width;
    const int evenWidth = !(w & 1);
    const int evenHeight = !(roiSize.height & 1);
    /* Rows 1 .. pairedRows are processed as (real row, imaginary row) pairs. */
    const int pairedRows = evenHeight ? roiSize.height - 2 : roiSize.height - 1;
    /* Number of interleaved complex values between the edge columns. */
    const int n = evenWidth ? (w - 2) >> 1 : (w - 1) >> 1;
    const Ipp32f *pS1Im, *pS2Im;
    Ipp32f *pDIm;
    Ipp32fc *a, *b, *prod;
    int i, row;

    /* ---- row 0 ---- */
    pD[0] = pS1[0] * pS2[0];
    if (evenWidth) {
        pD[w - 1] = pS1[w - 1] * pS2[w - 1];
    }
    a = (Ipp32fc *)(pS1 + 1);
    b = (Ipp32fc *)(pS2 + 1);
    prod = (Ipp32fc *)(pD + 1);
    for (i = 0; i < n; i++) {
        prod[i].re = MULRE(a[i], b[i]);
        prod[i].im = MULIM(a[i], b[i]);
    }

    /* Step to row 1; the *Im pointers track the row directly below. */
    pS1 = (const Ipp32f *)((const Ipp8u *)pS1 + s1Step);
    pS2 = (const Ipp32f *)((const Ipp8u *)pS2 + s2Step);
    pD = (Ipp32f *)((Ipp8u *)pD + dStep);
    pS1Im = (const Ipp32f *)((const Ipp8u *)pS1 + s1Step);
    pS2Im = (const Ipp32f *)((const Ipp8u *)pS2 + s2Step);
    pDIm = (Ipp32f *)((Ipp8u *)pD + dStep);

    /* ---- paired rows ---- */
    for (row = 1; row < pairedRows; row += 2) {
        /* Column 0: complex value split across the two rows. */
        pD[0] = pS1[0] * pS2[0] - pS1Im[0] * pS2Im[0];
        pDIm[0] = pS1[0] * pS2Im[0] + pS1Im[0] * pS2[0];
        if (evenWidth) {
            /* Last column: same split for even widths. */
            pD[w - 1] = pS1[w - 1] * pS2[w - 1] - pS1Im[w - 1] * pS2Im[w - 1];
            pDIm[w - 1] = pS1[w - 1] * pS2Im[w - 1] + pS1Im[w - 1] * pS2[w - 1];
        }
        /* Interior of both rows: interleaved complex values. */
        ippsMul_32fc((Ipp32fc *)(pS1 + 1), (Ipp32fc *)(pS2 + 1), (Ipp32fc *)(pD + 1), n);
        ippsMul_32fc((Ipp32fc *)(pS1Im + 1), (Ipp32fc *)(pS2Im + 1), (Ipp32fc *)(pDIm + 1), n);

        pS1 = (const Ipp32f *)((const Ipp8u *)pS1 + 2 * s1Step);
        pS2 = (const Ipp32f *)((const Ipp8u *)pS2 + 2 * s2Step);
        pD = (Ipp32f *)((Ipp8u *)pD + 2 * dStep);
        pS1Im = (const Ipp32f *)((const Ipp8u *)pS1Im + 2 * s1Step);
        pS2Im = (const Ipp32f *)((const Ipp8u *)pS2Im + 2 * s2Step);
        pDIm = (Ipp32f *)((Ipp8u *)pDIm + 2 * dStep);
    }

    /* ---- last row, present only for even heights; handled like row 0 ---- */
    if (evenHeight) {
        pD[0] = pS1[0] * pS2[0];
        if (evenWidth) {
            pD[w - 1] = pS1[w - 1] * pS2[w - 1];
        }
        a = (Ipp32fc *)(pS1 + 1);
        b = (Ipp32fc *)(pS2 + 1);
        prod = (Ipp32fc *)(pD + 1);
        ippsMul_32fc(a, b, prod, n);
    }
}

// multithreaded
static IppStatus diff_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    IppiSize roi;
    roi.width = info->roi.width;
    roi.height = info->threadLen;
    Ipp32f *curSrc1 = (Ipp32f *)((char *)info->pSrc1 + t * info->threadLen * info->src1Step);
    Ipp32f *curSrc2 = (Ipp32f *)((char *)info->pSrc2 + t * info->threadLen * info->src1Step);
    Ipp32f *curDst = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);

    if (t == info->nThreads - 1)
        roi.height += info->tail;
    if (t >= info->nThreads)
        roi.height = 0;
    ippiSub_32f_C1R(curSrc1, info->src1Step, curSrc2, info->src1Step, curDst, info->dstStep, roi);
    return ippStsOk;
}
static IppStatus add_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    IppiSize roi;
    roi.width = info->roi.width;
    roi.height = info->threadLen;
    Ipp32f *curSrc1 = (Ipp32f *)((char *)info->pSrc1 + t * info->threadLen * info->src1Step);
    Ipp32f *curDst = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);

    if (t == info->nThreads - 1)
        roi.height += info->tail;
    if (t >= info->nThreads)
        roi.height = 0;
    ippiAdd_32f_C1IR(curSrc1, info->src1Step, curDst, info->dstStep, roi);
    return ippStsOk;
}
/*
 * ippParallelFor_T worker: in-place multiplication by the constant info->val
 * on one band of rows.
 *
 * Band t starts at row t * threadLen. The last band (t == nThreads - 1) also
 * takes the `tail` rows; indices at or beyond nThreads get an empty band.
 * Always returns ippStsOk; the status of ippiMulC_32f_C1IR is not propagated.
 */
static IppStatus mulc_t_fun(int t, void *arg)
{
    ils_info_t *info = (ils_info_t *)arg;
    Ipp32f *bandDst = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);
    IppiSize band;

    band.width = info->roi.width;
    band.height = info->threadLen;
    if (t >= info->nThreads) {
        band.height = 0;
    } else if (t == info->nThreads - 1) {
        band.height += info->tail;
    }

    ippiMulC_32f_C1IR(info->val, bandDst, info->dstStep, band);
    return ippStsOk;
}
static IppStatus mu_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    IppiSize roi;
    roi.width = info->roi.width;
    roi.height = info->threadLen;
    Ipp32f *curSrc = (Ipp32f *)((char *)info->pSrc1 + t * info->threadLen * info->src1Step);
    Ipp32f *curDst = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);

    if (t == info->nThreads - 1)
        roi.height += info->tail;
    if (t >= info->nThreads)
        roi.height = 0;
    ils_s_MU_C1(curSrc, info->src1Step, info->c, info->p, info->gamma, info->eps, info->filter, curDst, info->dstStep, roi);
    return ippStsOk;
}
static IppStatus mulpack_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    IppiSize roi;
    roi.width = info->roi.width;
    roi.height = info->threadLen;
    Ipp32f *pS1 = (Ipp32f *)((char *)info->pSrc1 + t * info->threadLen * info->src1Step);
    Ipp32f *pS2 = (Ipp32f *)((char *)info->pSrc2 + t * info->threadLen * info->src2Step);
    Ipp32f *pD = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);

    if (t == info->nThreads - 1)
        roi.height += info->tail;
    if (t >= info->nThreads)
        roi.height = 0;
    int s1Step = info->src1Step;
    int s2Step = info->src2Step;
    int dStep = info->dstStep;
    Ipp32f *pS11 = (Ipp32f *)((Ipp8u *)pS1 + s1Step);
    Ipp32f *pS21 = (Ipp32f *)((Ipp8u *)pS2 + s2Step);
    Ipp32f *pD1 = (Ipp32f *)((Ipp8u *)pD + dStep);
    int w = info->roi.width;
    for (j = 0; j < roi.height; j += 2) {
        // left col
        pD[0] = pS1[0] * pS2[0] - pS11[0] * pS21[0];
        pD1[0] = pS1[0] * pS21[0] + pS11[0] * pS2[0];
        if (!(w & 1)) {
            pD[w - 1] = pS1[w - 1] * pS2[w - 1] - pS11[w - 1] * pS21[w - 1];
            pD1[w - 1] = pS1[w - 1] * pS21[w - 1] + pS11[w - 1] * pS2[w - 1];
        }
        // central part
        ippsMul_32fc((Ipp32fc *)(pS1 + 1), (Ipp32fc *)(pS2 + 1), (Ipp32fc *)(pD + 1), info->n);
        ippsMul_32fc((Ipp32fc *)(pS11 + 1), (Ipp32fc *)(pS21 + 1), (Ipp32fc *)(pD1 + 1), info->n);

        pS1 = (Ipp32f *)((Ipp8u *)pS1 + 2 * s1Step);
        pS2 = (Ipp32f *)((Ipp8u *)pS2 + 2 * s2Step);
        pD = (Ipp32f *)((Ipp8u *)pD + 2 * dStep);
        pS11 = (Ipp32f *)((Ipp8u *)pS11 + 2 * s1Step);
        pS21 = (Ipp32f *)((Ipp8u *)pS21 + 2 * s2Step);
        pD1 = (Ipp32f *)((Ipp8u *)pD1 + 2 * dStep);
    }
    return ippStsOk;
}
/*
 * Threaded counterpart of ils_s_DiffHR_C1: horizontal difference with wrap-around.
 *
 * Columns 0 .. width - 2 are split into row bands and handed to diff_t_fun
 * through ippParallelFor_T; the single wrap-around column (width - 1) is
 * computed by the calling thread afterwards.
 */
static void ils_t_DiffHR_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize, int numThreads)
{
    const int lastCol = roiSize.width - 1;
    IppiSize bodyRoi = roiSize;
    IppiSize edgeRoi = roiSize;
    ils_info_t info;

    bodyRoi.width = lastCol;
    edgeRoi.width = 1;

    /* Band layout: threadLen rows per worker, the remainder goes to the last one. */
    info.nThreads = numThreads;
    info.threadLen = bodyRoi.height / numThreads;
    info.tail = bodyRoi.height % numThreads;
    info.roi = bodyRoi;
    /* dst[x] = src[x + 1] - src[x] */
    info.pSrc1 = pSrc;
    info.pSrc2 = pSrc + 1;
    info.src1Step = srcStep;
    info.pDst = pDst;
    info.dstStep = dstStep;
    ippParallelFor_T(numThreads, (void *)&info, diff_t_fun);

    /* dst[width - 1] = src[0] - src[width - 1] */
    ippiSub_32f_C1R(pSrc + lastCol, srcStep, pSrc, srcStep, pDst + lastCol, dstStep, edgeRoi);
}
/*
 * Threaded counterpart of ils_s_MU_C1: the rows of the ROI are split into
 * numThreads bands (the last band also takes the remainder) and each band is
 * processed by mu_t_fun through ippParallelFor_T.
 */
static void ils_t_MU_C1(Ipp32f *pSrc, int srcStep, Ipp32f c, Ipp32f p, Ipp32f gamma, Ipp32f eps, IppiFilterILSType filter, Ipp32f *pDst, int dstStep,
                        IppiSize roiSize, int numThreads)
{
    ils_info_t info;

    /* Filter settings forwarded unchanged to every worker. */
    info.filter = filter;
    info.c = c;
    info.p = p;
    info.gamma = gamma;
    info.eps = eps;

    /* Images. */
    info.pSrc1 = pSrc;
    info.src1Step = srcStep;
    info.pDst = pDst;
    info.dstStep = dstStep;
    info.roi = roiSize;

    /* Band layout. */
    info.nThreads = numThreads;
    info.threadLen = roiSize.height / numThreads;
    info.tail = roiSize.height % numThreads;

    ippParallelFor_T(numThreads, (void *)&info, mu_t_fun);
}
/*
 * Threaded counterpart of ils_s_DiffHL_C1: horizontal difference towards the
 * left neighbour, with wrap-around.
 *
 * Columns 1 .. width - 1 are split into row bands and handed to diff_t_fun
 * through ippParallelFor_T; the single wrap-around column (0) is computed by
 * the calling thread afterwards.
 */
static void ils_t_DiffHL_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize, int numThreads)
{
    const int lastCol = roiSize.width - 1;
    IppiSize bodyRoi = roiSize;
    IppiSize edgeRoi = roiSize;
    ils_info_t info;

    bodyRoi.width = lastCol;
    edgeRoi.width = 1;

    /* Band layout: threadLen rows per worker, the remainder goes to the last one. */
    info.nThreads = numThreads;
    info.threadLen = bodyRoi.height / numThreads;
    info.tail = bodyRoi.height % numThreads;
    info.roi = bodyRoi;
    /* dst[x] = src[x - 1] - src[x] for x >= 1 */
    info.pSrc1 = pSrc + 1;
    info.pSrc2 = pSrc;
    info.src1Step = srcStep;
    info.pDst = pDst + 1;
    info.dstStep = dstStep;
    ippParallelFor_T(numThreads, (void *)&info, diff_t_fun);

    /* dst[0] = src[width - 1] - src[0] */
    ippiSub_32f_C1R(pSrc, srcStep, pSrc + lastCol, srcStep, pDst, dstStep, edgeRoi);
}
/*
 * Threaded counterpart of ils_s_DiffVU_C1: vertical difference with wrap-around.
 *
 * Rows 0 .. height - 2 are split into bands and handed to diff_t_fun through
 * ippParallelFor_T; the single wrap-around row (height - 1) is computed by the
 * calling thread afterwards.
 */
static void ils_t_DiffVU_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize, int numThreads)
{
    const int lastRow = roiSize.height - 1;
    IppiSize bodyRoi = roiSize;
    IppiSize edgeRoi = roiSize;
    ils_info_t info;
    Ipp32f *srcLastRow;
    Ipp32f *dstLastRow;

    bodyRoi.height = lastRow;
    edgeRoi.height = 1;

    /* Band layout: threadLen rows per worker, the remainder goes to the last one. */
    info.nThreads = numThreads;
    info.threadLen = bodyRoi.height / numThreads;
    info.tail = bodyRoi.height % numThreads;
    info.roi = bodyRoi;
    /* dst[r] = src[r + 1] - src[r] */
    info.pSrc1 = pSrc;
    info.pSrc2 = (Ipp32f *)((Ipp8u *)pSrc + srcStep);
    info.src1Step = srcStep;
    info.pDst = pDst;
    info.dstStep = dstStep;
    ippParallelFor_T(numThreads, (void *)&info, diff_t_fun);

    /* dst[height - 1] = src[0] - src[height - 1] */
    srcLastRow = (Ipp32f *)((Ipp8u *)pSrc + srcStep * lastRow);
    dstLastRow = (Ipp32f *)((Ipp8u *)pDst + dstStep * lastRow);
    ippiSub_32f_C1R(srcLastRow, srcStep, pSrc, srcStep, dstLastRow, dstStep, edgeRoi);
}
/*
 * Threaded counterpart of ils_s_DiffVD_C1: vertical difference towards the row
 * above, with wrap-around.
 *
 *   dst[0] = src[height - 1] - src[0]
 *   dst[r] = src[r - 1]      - src[r]   for rows r in [1, height - 1]
 *
 * The single wrap-around row is computed by the calling thread; the remaining
 * height - 1 rows are split into bands and handed to diff_t_fun through
 * ippParallelFor_T (the same split ils_t_DiffVU_C1 uses). Previously the roles
 * were swapped: the one-row job was dispatched to the thread pool while the
 * bulk of the image ran serially. The values written are the same as before
 * provided pSrc and pDst do not overlap.
 *
 * srcStep / dstStep are row strides in bytes; numThreads must be non-zero.
 */
static void ils_t_DiffVD_C1(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IppiSize roiSize, int numThreads)
{
    IppiSize roiSize1;

    /* Row 0 wraps around to the last row; one row is not worth splitting. */
    roiSize1 = roiSize;
    roiSize1.height = 1;
    Ipp32f *pSrcLast = (Ipp32f *)((Ipp8u *)pSrc + srcStep * (roiSize.height - 1));
    ippiSub_32f_C1R(pSrc, srcStep, pSrcLast, srcStep, pDst, dstStep, roiSize1);

    /* Rows 1 .. height - 1: dst[r] = src[r - 1] - src[r], done in parallel bands. */
    roiSize1 = roiSize;
    roiSize1.height--;
    if (roiSize1.height > 0) {
        ils_info_t info;
        /* diff_t_fun computes pSrc2 - pSrc1, so pSrc1 is the current row and pSrc2 the row above. */
        info.pSrc1 = (Ipp32f *)((Ipp8u *)pSrc + srcStep);
        info.pSrc2 = pSrc;
        info.src1Step = srcStep;
        info.pDst = (Ipp32f *)((Ipp8u *)pDst + dstStep);
        info.dstStep = dstStep;
        info.roi = roiSize1;
        info.nThreads = numThreads;
        info.threadLen = roiSize1.height / numThreads;
        info.tail = roiSize1.height % numThreads;
        ippParallelFor_T(numThreads, (void *)&info, diff_t_fun);
    }
}
/*
 * Threaded in-place addition: pSrcDst += pSrc over the ROI.
 *
 * The rows are split into numThreads bands (the last band also takes the
 * remainder) and each band is processed by add_t_fun through ippParallelFor_T.
 */
static void ils_t_Add_C1I(Ipp32f *pSrc, int srcStep, Ipp32f *pSrcDst, int srcDstStep, IppiSize roiSize, int numThreads)
{
    ils_info_t info;

    /* Band layout. */
    info.nThreads = numThreads;
    info.threadLen = roiSize.height / numThreads;
    info.tail = roiSize.height % numThreads;
    info.roi = roiSize;

    /* Operands. */
    info.pSrc1 = pSrc;
    info.src1Step = srcStep;
    info.pDst = pSrcDst;
    info.dstStep = srcDstStep;

    ippParallelFor_T(numThreads, (void *)&info, add_t_fun);
}
/*
 * Threaded in-place scaling: pSrcDst *= val over the ROI.
 *
 * The rows are split into numThreads bands (the last band also takes the
 * remainder) and each band is processed by mulc_t_fun through ippParallelFor_T.
 */
static void ils_t_MulC_C1I(Ipp32f val, Ipp32f *pSrcDst, int srcDstStep, IppiSize roiSize, int numThreads)
{
    ils_info_t info;

    /* Band layout. */
    info.nThreads = numThreads;
    info.threadLen = roiSize.height / numThreads;
    info.tail = roiSize.height % numThreads;
    info.roi = roiSize;

    /* Operand and scale factor. */
    info.pDst = pSrcDst;
    info.dstStep = srcDstStep;
    info.val = val;

    ippParallelFor_T(numThreads, (void *)&info, mulc_t_fun);
}
/*
 * Threaded counterpart of ils_s_MulPack_32f_C1R: element-wise product of two
 * packed-spectrum images.
 * NOTE(review): the layout handled here looks like the IPP 2-D "Pack" format - confirm.
 *
 * Row 0 (and the last row when the height is even) is handled by the calling
 * thread: the edge columns are multiplied as real values, the n values in
 * between as interleaved complex numbers. The rows in between are consumed in
 * pairs by mulpack_t_fun through ippParallelFor_T; every worker receives an
 * even number of rows, and the last worker also takes the remainder.
 *
 * s1Step / s2Step / dStep are row strides in bytes.
 */
static void ils_t_MulPack_32f_C1R(Ipp32f *pSrc1, int s1Step, Ipp32f *pSrc2, int s2Step, Ipp32f *pDst, int dStep, IppiSize roiSize, int numThreads)
{
    const int w = roiSize.width;
    const int evenWidth = !(w & 1);
    const int evenHeight = !(roiSize.height & 1);
    /* Rows 1 .. pairedRows are processed as (real row, imaginary row) pairs. */
    const int pairedRows = evenHeight ? roiSize.height - 2 : roiSize.height - 1;
    /* Number of interleaved complex values between the edge columns. */
    const int n = evenWidth ? (w - 2) >> 1 : (w - 1) >> 1;
    Ipp32fc *a, *b, *prod;
    ils_info_t info;
    int i;

    /* ---- row 0 ---- */
    pDst[0] = pSrc1[0] * pSrc2[0];
    if (evenWidth) {
        pDst[w - 1] = pSrc1[w - 1] * pSrc2[w - 1];
    }
    a = (Ipp32fc *)(pSrc1 + 1);
    b = (Ipp32fc *)(pSrc2 + 1);
    prod = (Ipp32fc *)(pDst + 1);
    for (i = 0; i < n; i++) {
        prod[i].re = MULRE(a[i], b[i]);
        prod[i].im = MULIM(a[i], b[i]);
    }

    /* ---- paired rows, starting at row 1, in parallel ---- */
    info.pSrc1 = (Ipp32f *)((Ipp8u *)pSrc1 + s1Step);
    info.src1Step = s1Step;
    info.pSrc2 = (Ipp32f *)((Ipp8u *)pSrc2 + s2Step);
    info.src2Step = s2Step;
    info.pDst = (Ipp32f *)((Ipp8u *)pDst + dStep);
    info.dstStep = dStep;
    info.roi = roiSize;
    info.nThreads = numThreads;
    /* Whole row pairs per worker, expressed in rows. */
    info.threadLen = ((pairedRows >> 1) / numThreads) << 1;
    info.tail = pairedRows - info.threadLen * numThreads;
    info.n = n;
    ippParallelFor_T(numThreads, (void *)&info, mulpack_t_fun);

    /* ---- last row, present only for even heights; handled like row 0 ---- */
    if (evenHeight) {
        Ipp32f *lastS1 = (Ipp32f *)((Ipp8u *)pSrc1 + (roiSize.height - 1) * s1Step);
        Ipp32f *lastS2 = (Ipp32f *)((Ipp8u *)pSrc2 + (roiSize.height - 1) * s2Step);
        Ipp32f *lastD = (Ipp32f *)((Ipp8u *)pDst + (roiSize.height - 1) * dStep);

        lastD[0] = lastS1[0] * lastS2[0];
        if (evenWidth) {
            lastD[w - 1] = lastS1[w - 1] * lastS2[w - 1];
        }
        a = (Ipp32fc *)(lastS1 + 1);
        b = (Ipp32fc *)(lastS2 + 1);
        prod = (Ipp32fc *)(lastD + 1);
        ippsMul_32fc(a, b, prod, n);
    }
}
/*
 * Transposes a block of complex values that is 8 wide and `len` high from x
 * (row stride stepX) into y (row stride stepY) using ippiTranspose_32fc_C1R.
 */
static void ils_cDftMerge_32fc(Ipp32fc *x, int stepX, Ipp32fc *y, int stepY, int len)
{
    IppiSize block;

    block.height = len;
    block.width = 8;
    ippiTranspose_32fc_C1R(x, stepX, y, stepY, block);
}
/*
 * Reverse of ils_cDftMerge_32fc: transposes a block of complex values that is
 * `len` wide and 8 high from y (row stride stepY) back into x (row stride
 * stepX). Note that y is the source and x the destination here.
 */
static void ils_cDftSplit_32fc(Ipp32fc *x, int stepX, Ipp32fc *y, int stepY, int len)
{
    IppiSize block;

    block.height = 8;
    block.width = len;
    ippiTranspose_32fc_C1R(y, stepY, x, stepX, block);
}

/*
 * Reports how much memory the 2-D DFT helper needs for an ROI of
 * roiSize.width x roiSize.height.
 *
 *   pSizeSpec - receives the size of the context header plus the 64-byte
 *               aligned sizes of the 1-D specs: real DFT of length width, real
 *               DFT of length height (only when it differs from the width) and
 *               complex DFT of length height.
 *   pSizeInit - receives the sum of the 64-byte aligned init-buffer sizes of
 *               the same specs.
 *   pSizeBuf  - receives the largest per-thread scratch requirement of the
 *               three transforms, aligned to 64 bytes and multiplied by the
 *               thread count reported by ippGetNumThreads_T.
 *
 * RESERVE_SIZE is added to every non-zero result.
 *
 * Returns the status of the first failing ipps*GetSize call (pSizeSpec and
 * pSizeInit then hold partial sums, pSizeBuf is left untouched), otherwise
 * ippStsNoErr.
 */
static IppStatus ils_t_DFTGetSize_R_32f(IppiSize roiSize, int flag, IppHintAlgorithm hint, int *pSizeSpec, int *pSizeInit, int *pSizeBuf)
{
    const int lenX = roiSize.width;
    const int lenY = roiSize.height;
    int specBytes = 0;
    int initBytes = 0;
    int workBytes = 0;
    int workRealX, workRealY, workCplxY;
    int nThreads = 1;
    IppStatus sts;

    *pSizeSpec = IPP_ALIGNED_SIZE(sizeof(IlsDFTContext_32f), 64);
    *pSizeInit = 0;

    /* Real DFT along X. */
    sts = ippsDFTGetSize_R_32f(lenX, flag, hint, &specBytes, &initBytes, &workBytes);
    if (sts != ippStsNoErr) {
        return sts;
    }
    *pSizeSpec += IPP_ALIGNED_SIZE(specBytes, 64);
    *pSizeInit += IPP_ALIGNED_SIZE(initBytes, 64);
    workRealX = workBytes;

    /* Real DFT along Y: a separate spec is only needed when the lengths differ. */
    workRealY = workRealX;
    if (lenX != lenY) {
        sts = ippsDFTGetSize_R_32f(lenY, flag, hint, &specBytes, &initBytes, &workBytes);
        if (sts != ippStsNoErr) {
            return sts;
        }
        *pSizeSpec += IPP_ALIGNED_SIZE(specBytes, 64);
        *pSizeInit += IPP_ALIGNED_SIZE(initBytes, 64);
        workRealY = workBytes;
    }

    /* Complex DFT along Y. */
    sts = ippsDFTGetSize_C_32fc(lenY, flag, hint, &specBytes, &initBytes, &workBytes);
    if (sts != ippStsNoErr) {
        return sts;
    }
    *pSizeSpec += IPP_ALIGNED_SIZE(specBytes, 64);
    *pSizeInit += IPP_ALIGNED_SIZE(initBytes, 64);
    workCplxY = workBytes;

    /* Scratch: staging rows in front of the IPP work buffer, largest of the three cases. */
    *pSizeBuf = IPP_ALIGNED_SIZE(
        IPP_MAX(4 * lenX * sizeof(Ipp32f) + workRealX, IPP_MAX(4 * lenY * sizeof(Ipp32f) + workRealY, 16 * lenY * sizeof(Ipp32f) + workCplxY)),
        64);

    /* One scratch area per thread. */
    ippGetNumThreads_T(&nThreads);
    *pSizeBuf = IPP_ALIGNED_SIZE(*pSizeBuf, 64) * nThreads;

    if (*pSizeSpec > 0) {
        *pSizeSpec += RESERVE_SIZE;
    }
    if (*pSizeInit > 0) {
        *pSizeInit += RESERVE_SIZE;
    }
    if (*pSizeBuf > 0) {
        *pSizeBuf += RESERVE_SIZE;
    }

    return ippStsNoErr;
}
/*
 * Builds the 2-D DFT context inside caller-provided memory.
 *
 *   roiSize  - width / height give the DFT lengths along X and Y.
 *   flag     - passed unchanged to the ipps DFT GetSize / Init calls.
 *   hint     - passed unchanged to the ipps DFT calls and stored in the context.
 *   pDFTSpec - memory for the context header followed by the 1-D specs.
 *   pMemInit - temporary memory used by the ippsDFTInit_* calls.
 *
 * Layout written into pDFTSpec (each part starts on an aligned address):
 *   [IlsDFTContext_32f][real spec, length X][real spec, length Y, only if X != Y][complex spec, length Y]
 *
 * Returns the status of the first failing ipps call, otherwise ippStsNoErr.
 */
static IppStatus ils_t_DFTInit_R_32f(IppiSize roiSize, int flag, IppHintAlgorithm hint, IlsDFTContext_32f *pDFTSpec, Ipp8u *pMemInit)
{
    IlsDFTContext_32f *ctxDFT = NULL;
    IppStatus res = ippStsNoErr;
    int lenX, lenY;
    int bufSize_R_X, bufSize_R_Y, bufSize_C_Y;
    Ipp32s sizeSpec = 0;
    Ipp32s sizeInit = 0;
    Ipp32s sizeBuf = 0;

    /* The header sits at the first 64-byte aligned address; from here on pDFTSpec
       is reused as a cursor that points just past the last part placed so far. */
    ctxDFT = (IlsDFTContext_32f *)IPP_ALIGNED_PTR(pDFTSpec, 64);
    pDFTSpec = (IlsDFTContext_32f *)((Ipp8u *)ctxDFT + IPP_ALIGNED_SIZE(sizeof(IlsDFTContext_32f), 64));

    /* Zero the header, so spec pointers that are not set below stay NULL. */
    ippsSet_8u(0, (Ipp8u *)ctxDFT, sizeof(IlsDFTContext_32f));

    ctxDFT->lengthX = lenX = roiSize.width;
    ctxDFT->lengthY = lenY = roiSize.height;
    ctxDFT->hint = hint;

    /* ---- real DFT along X ---- */
    res = ippsDFTGetSize_R_32f(lenX, flag, hint, &sizeSpec, &sizeInit, &sizeBuf);
    if (res != ippStsNoErr)
        return res;

    pMemInit = (Ipp8u *)IPP_ALIGNED_PTR(pMemInit, 64);

    ctxDFT->ctxDFT_R_X = (ilsDFTContext_32f *)ALIGNED_ADDR(pDFTSpec);
    pDFTSpec = (IlsDFTContext_32f *)((Ipp8u *)ctxDFT->ctxDFT_R_X + IPP_ALIGNED_SIZE(sizeSpec, 64));

    res = ippsDFTInit_R_32f(lenX, flag, hint, (IppsDFTSpec_R_32f *)ctxDFT->ctxDFT_R_X, pMemInit);
    if (res != ippStsNoErr)
        return res;

    bufSize_R_X = sizeBuf;

    /* ---- real DFT along Y: a separate spec only when the lengths differ ---- */
    if (lenX != lenY) {
        /* Step past the init memory used by the X spec. */
        if (sizeInit) {
            pMemInit += IPP_ALIGNED_SIZE(sizeInit, 64);
        }

        res = ippsDFTGetSize_R_32f(lenY, flag, hint, &sizeSpec, &sizeInit, &sizeBuf);
        if (res != ippStsNoErr)
            return res;

        ctxDFT->ctxDFT_R_Y = (ilsDFTContext_32f *)ALIGNED_ADDR(pDFTSpec);
        pDFTSpec = (IlsDFTContext_32f *)((Ipp8u *)ctxDFT->ctxDFT_R_Y + IPP_ALIGNED_SIZE(sizeSpec, 64));

        res = ippsDFTInit_R_32f(lenY, flag, hint, (IppsDFTSpec_R_32f *)ctxDFT->ctxDFT_R_Y, pMemInit);
        if (res != ippStsNoErr)
            return res;

        if (sizeInit) {
            pMemInit += IPP_ALIGNED_SIZE(sizeInit, 64);
        }

        bufSize_R_Y = sizeBuf;
    } else {
        /* ctxDFT_R_Y stays NULL here; pMemInit is not advanced in this case. */
        bufSize_R_Y = bufSize_R_X;
    }

    /* ---- complex DFT along Y ---- */
    res = ippsDFTGetSize_C_32fc(lenY, flag, hint, &sizeSpec, &sizeInit, &sizeBuf);
    if (res != ippStsNoErr)
        return res;

    ctxDFT->ctxDFT_C_Y = (ilsDFTContext_32f *)ALIGNED_ADDR(pDFTSpec);

    res = ippsDFTInit_C_32fc(lenY, flag, hint, (IppsDFTSpec_C_32fc *)ctxDFT->ctxDFT_C_Y, pMemInit);
    if (res != ippStsNoErr)
        return res;

    bufSize_C_Y = sizeBuf;

    /* Scratch needed by one thread: staging rows plus the IPP work buffer, largest
       of the three cases. Stored without alignment and without a thread-count factor. */
    ctxDFT->bufSize =
        IPP_MAX(4 * lenX * sizeof(Ipp32f) + bufSize_R_X, IPP_MAX(4 * lenY * sizeof(Ipp32f) + bufSize_R_Y, 16 * lenY * sizeof(Ipp32f) + bufSize_C_Y));

    return ippStsNoErr;
}
/*
 * COL2ROW(S, ST, D, N): gathers N values from a strided column into a
 * contiguous row.
 *   S  - Ipp32f pointer to the first element of the column
 *   ST - distance in bytes between consecutive column elements
 *   D  - destination, receives D[0] .. D[N - 1]
 *   N  - number of elements
 *
 * Requires a loop variable named `j` in the enclosing scope. Expands to a
 * braced block, so it is written without a trailing semicolon.
 * All arguments are parenthesized and the cursor has a macro-specific name, so
 * expression arguments (e.g. `buf + 1`) and caller variables named `s` are
 * handled correctly. ST, D and N are evaluated once per iteration.
 */
#define COL2ROW(S, ST, D, N)                                          \
    {                                                                 \
        Ipp32f *col2row_cur_ = (S);                                   \
        for (j = 0; j < (N); j++) {                                   \
            (D)[j] = (*col2row_cur_);                                 \
            col2row_cur_ = (Ipp32f *)((char *)col2row_cur_ + (ST));   \
        }                                                             \
    }
/*
 * ROW2COL(S, D, ST, N): scatters N contiguous values into a strided column.
 *   S  - source, S[0] .. S[N - 1] are read
 *   D  - pointer to the first element of the destination column
 *   ST - distance in bytes between consecutive column elements
 *   N  - number of elements
 *
 * Requires a loop variable named `j` in the enclosing scope. Expands to a
 * braced block, so it is written without a trailing semicolon.
 * All arguments are parenthesized and the cursor has a macro-specific name, so
 * expression arguments (e.g. `buf + 1`) and caller variables named `d` are
 * handled correctly. S, ST and N are evaluated once per iteration.
 */
#define ROW2COL(S, D, ST, N)                                          \
    {                                                                 \
        Ipp32f *row2col_cur_ = (Ipp32f *)(D);                         \
        for (j = 0; j < (N); j++) {                                   \
            (*row2col_cur_) = (S)[j];                                 \
            row2col_cur_ = (Ipp32f *)((char *)row2col_cur_ + (ST));   \
        }                                                             \
    }
static IppStatus fftfwd_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    Ipp8u *b0 = info->buf + t * info->ctxDFT->bufSize;
    ;
    Ipp32f *curSrc = (Ipp32f *)((char *)info->pSrc1 + t * info->threadLen * info->src1Step);
    Ipp32f *curDst = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);
    int curLen = info->threadLen;
    if (t == info->nThreads - 1)
        curLen += info->tail;
    else if (t >= info->nThreads)
        curLen = 0;
    for (j = 0; j < curLen; j++) {
        ippsDFTFwd_RToPack_32f(curSrc, curDst, (const IppsDFTSpec_R_32f *)info->ctxDFT->ctxDFT_R_X, b0);
        curSrc = (Ipp32f *)((char *)curSrc + info->src1Step);
        curDst = (Ipp32f *)((char *)curDst + info->dstStep);
    }
    return ippStsOk;
}
/*
 * ippParallelFor_T worker: inverse Pack-to-real 1-D DFT, in place, of every
 * destination row in one band, using the X spec from the shared context.
 *
 * Band t starts at row t * threadLen. The last band (t == nThreads - 1) also
 * takes the `tail` rows; indices at or beyond nThreads do nothing. Each worker
 * uses its own scratch slice, located bufSize bytes apart inside info->buf.
 * Always returns ippStsOk; DFT statuses are not propagated.
 */
static IppStatus fftinv_t_fun(int t, void *arg)
{
    ils_info_t *info = (ils_info_t *)arg;
    const IppsDFTSpec_R_32f *spec = (const IppsDFTSpec_R_32f *)info->ctxDFT->ctxDFT_R_X;
    Ipp8u *work = info->buf + t * info->ctxDFT->bufSize;
    Ipp32f *line = (Ipp32f *)((char *)info->pDst + t * info->threadLen * info->dstStep);
    int rows = info->threadLen;
    int r;

    if (t >= info->nThreads) {
        rows = 0;
    } else if (t == info->nThreads - 1) {
        rows += info->tail;
    }

    for (r = 0; r < rows; r++) {
        ippsDFTInv_PackToR_32f(line, line, spec, work);
        line = (Ipp32f *)((char *)line + info->dstStep);
    }
    return ippStsOk;
}
static IppStatus fftfwd_c_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    Ipp32f *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
    b0 = (Ipp32f *)(info->buf + t * info->ctxDFT->bufSize);

    int lenY = info->ctxDFT->lengthY;
    int i, curLen;
    Ipp32f *curDst;
    curDst = info->pDst + 1 + t * info->threadLen * 8 * 2;
    b1 = b0 + 2 * lenY;
    b2 = b1 + 2 * lenY;
    b3 = b2 + 2 * lenY;
    b4 = b3 + 2 * lenY;
    b5 = b4 + 2 * lenY;
    b6 = b5 + 2 * lenY;
    b7 = b6 + 2 * lenY;
    curLen = info->threadLen;
    if (t == info->nThreads - 1)
        curLen += info->tail;
    else if (t >= info->nThreads)
        curLen = 0;
    for (i = 0; i < curLen; i++) {
        ils_cDftMerge_32fc((Ipp32fc *)curDst, info->dstStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b4, (Ipp32fc *)b4, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b5, (Ipp32fc *)b5, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b6, (Ipp32fc *)b6, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b7, (Ipp32fc *)b7, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ils_cDftSplit_32fc((Ipp32fc *)curDst, info->dstStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
        curDst += 8 * 2;
    }
    return ippStsOk;
}
static IppStatus fftinv_c_t_fun(int t, void *arg)
{
    int j;
    ils_info_t *info = (ils_info_t *)arg;
    Ipp32f *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
    b0 = (Ipp32f *)(info->buf + t * info->ctxDFT->bufSize);

    int lenY = info->ctxDFT->lengthY;
    int i, curLen;
    Ipp32f *curSrc = info->pSrc1 + 1 + t * info->threadLen * 8 * 2;
    Ipp32f *curDst = info->pDst + 1 + t * info->threadLen * 8 * 2;
    b1 = b0 + 2 * lenY;
    b2 = b1 + 2 * lenY;
    b3 = b2 + 2 * lenY;
    b4 = b3 + 2 * lenY;
    b5 = b4 + 2 * lenY;
    b6 = b5 + 2 * lenY;
    b7 = b6 + 2 * lenY;
    curLen = info->threadLen;
    if (t == info->nThreads - 1)
        curLen += info->tail;
    else if (t >= info->nThreads)
        curLen = 0;
    for (i = 0; i < curLen; i++) {
        ils_cDftMerge_32fc((Ipp32fc *)curSrc, info->src1Step, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
        ippsDFTInv_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b4, (Ipp32fc *)b4, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b5, (Ipp32fc *)b5, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b6, (Ipp32fc *)b6, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b7, (Ipp32fc *)b7, (const IppsDFTSpec_C_32fc *)info->ctxDFT->ctxDFT_C_Y, (Ipp8u *)(b0 + 16 * lenY));
        ils_cDftSplit_32fc((Ipp32fc *)curDst, info->dstStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
        curSrc += 8 * 2;
        curDst += 8 * 2;
    }
    return ippStsOk;
}
/*
 * 2-D forward real DFT assembled from 1-D IPP DFTs.
 *
 *   pSrc, srcStep - source image and its row stride in bytes
 *   pDst, dstStep - destination image and its row stride in bytes
 *   pDFTSpec      - context prepared by ils_t_DFTInit_R_32f; supplies lenX, lenY and the 1-D specs
 *   pBuffer       - scratch memory (aligned internally with ALIGNED_ADDR)
 *   numThreads    - when > 1 and USE_THREADS(lenX, lenY) holds, the row pass and the
 *                   16-float column groups are dispatched through ippParallelFor_T
 *
 * Steps:
 *   1. every row: real-to-Pack DFT from pSrc into pDst;
 *   2. column 0: real-to-Pack DFT in place;
 *   3. columns 1 ..: treated as interleaved (re, im) pairs and transformed in place
 *      with the complex spec, in groups of 16 floats, then 8 floats, then 2 floats;
 *   4. even lenX only: the last column gets a real-to-Pack DFT in place.
 *
 * Returns the status of the 1-D transform when lenY == 1; on every other path
 * ippStsNoErr is returned and the statuses of the inner IPP calls are not checked.
 */
static IppStatus ils_DFTFwd_RToPack_32f_C1R(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IlsDFTContext_32f *pDFTSpec, Ipp8u *pBuffer,
                                            int numThreads)
{
    IlsDFTContext_32f *ctxDFT = (IlsDFTContext_32f *)ALIGNED_ADDR(pDFTSpec);
    IppsDFTSpec_R_32f *ctx_R_X, *ctx_R_Y;
    IppsDFTSpec_C_32fc *ctx_C_Y;
    int lenX, lenY;
    Ipp8u *buf;
    Ipp32f *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
    Ipp32f *s, *d;
    int i, j;
    buf = (Ipp8u *)ALIGNED_ADDR(pBuffer);
    b0 = (Ipp32f *)buf;
    lenX = ctxDFT->lengthX;
    lenY = ctxDFT->lengthY;
    ctx_R_X = (IppsDFTSpec_R_32f *)ctxDFT->ctxDFT_R_X;
    ctx_R_Y = (IppsDFTSpec_R_32f *)ctxDFT->ctxDFT_R_Y;
    ctx_C_Y = (IppsDFTSpec_C_32fc *)ctxDFT->ctxDFT_C_Y;

    /* No separate Y spec in the context (left NULL by init when lenX == lenY): share the X spec. */
    if (ctx_R_Y == NULL)
        ctx_R_Y = ctx_R_X;
    /* Single row: one 1-D transform is the whole result. */
    if (lenY == 1) {
        return ippsDFTFwd_RToPack_32f(pSrc, pDst, ctx_R_X, buf);
    }
    /* Single column: gather it into b0, transform, scatter into pDst. */
    if (lenX == 1) {
        COL2ROW(pSrc, srcStep, b0, lenY)
        ippsDFTFwd_RToPack_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
        d = pDst;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[j];
            d = (Ipp32f *)((char *)d + dstStep);
        }
        return ippStsNoErr;
    }
    /* Step 1: row transforms, pSrc -> pDst, in parallel bands or serially. */
    if (numThreads > 1 && USE_THREADS(lenX, lenY)) {
        ils_info_t info;
        info.pSrc1 = pSrc;
        info.src1Step = srcStep;
        info.pDst = pDst;
        info.dstStep = dstStep;
        info.nThreads = numThreads;
        info.threadLen = lenY / numThreads;
        info.tail = lenY % numThreads;
        info.ctxDFT = ctxDFT;
        info.buf = buf;
        ippParallelFor_T(numThreads, (void *)&info, fftfwd_t_fun);
    } else {
        s = (Ipp32f *)pSrc;
        d = pDst;
        for (j = 0; j < lenY; j++) {
            ippsDFTFwd_RToPack_32f(s, d, ctx_R_X, buf);
            s = (Ipp32f *)((char *)s + srcStep);
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }

    /* Step 2: column 0 is transformed as a real sequence, in place in pDst. */
    COL2ROW(pDst, dstStep, b0, lenY)
    ippsDFTFwd_RToPack_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
    ROW2COL(b0, pDst, dstStep, lenY)

    /* Step 3: complex columns start at float column 1. b0..b3 (and b4..b7 below)
       are staging lines of lenY complex values each, laid out back to back. */
    i = 1;
    b1 = b0 + 2 * lenY;
    b2 = b1 + 2 * lenY;
    b3 = b2 + 2 * lenY;

    /* Groups of 16 floats (8 complex columns): threaded, or serial when BIG_IMAGE holds;
       otherwise this stage is skipped and the 8-float loop below starts at column 1. */
    if (numThreads > 1 && USE_THREADS(lenX, lenY)) {
        ils_info_t info;
        info.pSrc1 = pSrc;
        info.src1Step = srcStep;
        info.pDst = pDst;
        info.dstStep = dstStep;
        info.nThreads = numThreads;
        /* Work items are whole 16-float groups. */
        info.threadLen = ((lenX - 1) / 16) / numThreads;
        info.tail = ((lenX - 1) / 16) % numThreads;
        info.ctxDFT = ctxDFT;
        info.buf = buf;
        ippParallelFor_T(numThreads, (void *)&info, fftfwd_c_t_fun);
        /* First column not covered by a full 16-float group. */
        i = ((lenX - 1) & ~15) + 1;
    } else if (BIG_IMAGE(lenX, lenY)) {
        b4 = b3 + 2 * lenY;
        b5 = b4 + 2 * lenY;
        b6 = b5 + 2 * lenY;
        b7 = b6 + 2 * lenY;
        for (; i < ((lenX - 1) & ~15) + 1; i += 16) {
            d = pDst + i;
            /* Transpose 8 complex columns into b0..b7, transform each, transpose back.
               The IPP work buffer follows the 8 staging lines. */
            ils_cDftMerge_32fc((Ipp32fc *)d, dstStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b4, (Ipp32fc *)b4, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b5, (Ipp32fc *)b5, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b6, (Ipp32fc *)b6, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTFwd_CToC_32fc((Ipp32fc *)b7, (Ipp32fc *)b7, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ils_cDftSplit_32fc((Ipp32fc *)d, dstStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
        }
        i = ((lenX - 1) & ~15) + 1;
    }
    /* Groups of 8 floats (4 complex columns), continuing from wherever i stands. */
    for (; i < ((lenX - 1) & ~7) + 1; i += 8) {
        d = pDst + i;
        /* Gather 4 complex columns into b0..b3. */
        for (j = 0; j < lenY; j++) {
            b0[2 * j] = d[0];
            b0[2 * j + 1] = d[1];
            b1[2 * j] = d[2];
            b1[2 * j + 1] = d[3];
            b2[2 * j] = d[4];
            b2[2 * j + 1] = d[5];
            b3[2 * j] = d[6];
            b3[2 * j + 1] = d[7];
            d = (Ipp32f *)((char *)d + dstStep);
        }
        /* The IPP work buffer follows the 4 staging lines. */
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        /* Scatter the results back into the same columns. */
        d = pDst + i;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[2 * j];
            d[1] = b0[2 * j + 1];
            d[2] = b1[2 * j];
            d[3] = b1[2 * j + 1];
            d[4] = b2[2 * j];
            d[5] = b2[2 * j + 1];
            d[6] = b3[2 * j];
            d[7] = b3[2 * j + 1];
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }
    /* Remaining complex columns, one (re, im) pair at a time. */
    for (i = ((lenX - 1) & ~7) + 1; i < lenX - 1; i += 2) {
        d = pDst + i;
        for (j = 0; j < lenY; j++) {
            b0[2 * j] = d[0];
            b0[2 * j + 1] = d[1];
            d = (Ipp32f *)((char *)d + dstStep);
        }
        ippsDFTFwd_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 2 * lenY));
        d = pDst + i;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[2 * j];
            d[1] = b0[2 * j + 1];
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }
    /* Odd lenX: every column after column 0 belonged to a complex pair, nothing left. */
    if (lenX & 1) {
        return ippStsNoErr;
    }
    /* Step 4 (even lenX): the last column is transformed as a real sequence, like column 0. */
    COL2ROW(pDst + lenX - 1, dstStep, b0, lenY)
    ippsDFTFwd_RToPack_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
    ROW2COL(b0, pDst + lenX - 1, dstStep, lenY)
    return ippStsNoErr;
}
/*
 * Inverse 2D DFT of a pack-format spectrum into a real plane, assembled from 1D IPP DFT calls.
 *
 *   pSrc, srcStep  - input spectrum; the step is in bytes (rows are advanced through char* arithmetic).
 *   pDst, dstStep  - output plane; the step is in bytes.
 *   pDFTSpec       - context supplying lengthX/lengthY and the 1D DFT specs (ctxDFT_R_X, ctxDFT_R_Y, ctxDFT_C_Y).
 *   pBuffer        - scratch memory; used both for column staging buffers and as the work buffer of the 1D DFTs.
 *   numThreads     - when > 1 and USE_THREADS(lenX, lenY) holds, part of the work is handed to ippParallelFor_T.
 *
 * Pass 1 (columns, pSrc -> pDst): column 0 and, for even lenX, column lenX-1 go through
 * ippsDFTInv_PackToR_32f; every pair of columns (i, i+1) in between is gathered as one vector of
 * interleaved values and goes through ippsDFTInv_CToC_32fc.
 * Pass 2 (rows, in place in pDst): every row goes through ippsDFTInv_PackToR_32f.
 *
 * Always returns ippStsOk / ippStsNoErr; the statuses of the underlying IPP calls are not checked.
 */
static IppStatus ils_DFTInv_PackToR_32f_C1R(Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, IlsDFTContext_32f *pDFTSpec, Ipp8u *pBuffer,
                                            int numThreads)
{
    // NOTE(review): ALIGNED_ADDR is defined outside this view; presumably it rounds the pointer up to the
    // alignment the context/buffer were sized for - confirm against its definition.
    IlsDFTContext_32f *ctxDFT = (IlsDFTContext_32f *)ALIGNED_ADDR(pDFTSpec);
    IppsDFTSpec_R_32f *ctx_R_X, *ctx_R_Y;
    IppsDFTSpec_C_32fc *ctx_C_Y;
    // IppStatus           res;
    int lenX, lenY;
    Ipp8u *buf;
    Ipp32f *b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7;
    Ipp32f *s, *d;
    int i, j;

    buf = (Ipp8u *)ALIGNED_ADDR(pBuffer);
    // b0 is the first column staging buffer and the base from which b1..b7 are derived.
    b0 = (Ipp32f *)buf;

    lenX = ctxDFT->lengthX;
    lenY = ctxDFT->lengthY;
    ctx_R_X = (IppsDFTSpec_R_32f *)ctxDFT->ctxDFT_R_X;
    ctx_R_Y = (IppsDFTSpec_R_32f *)ctxDFT->ctxDFT_R_Y;
    ctx_C_Y = (IppsDFTSpec_C_32fc *)ctxDFT->ctxDFT_C_Y;
    // A missing Y spec falls back to the X spec (presumably the two lengths are equal in that case - TODO confirm
    // against the context initialization).
    if (ctx_R_Y == NULL)
        ctx_R_Y = ctx_R_X;

    // Degenerate case: a single row is a plain 1D inverse transform.
    if (lenY == 1) {
        ippsDFTInv_PackToR_32f(pSrc, pDst, ctx_R_X, buf);
        return ippStsOk;
    }
    // Degenerate case: a single column is gathered into b0, transformed, and scattered back.
    if (lenX == 1) {
        s = (Ipp32f *)pSrc;
        for (j = 0; j < lenY; j++) {
            b0[j] = s[0];
            s = (Ipp32f *)((char *)s + srcStep);
        }
        ippsDFTInv_PackToR_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
        d = pDst;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[j];
            d = (Ipp32f *)((char *)d + dstStep);
        }
        return ippStsOk;
    }

    // ---- Pass 1: inverse transforms along columns, reading pSrc and writing pDst ----

    // Column 0 is transformed as a packed real spectrum.
    // NOTE(review): COL2ROW / ROW2COL are macros defined outside this view; by their use here they presumably
    // gather a strided column into a contiguous buffer and scatter it back - confirm.
    s = (Ipp32f *)pSrc;
    COL2ROW(pSrc, srcStep, b0, lenY)
    ippsDFTInv_PackToR_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
    ROW2COL(b0, pDst, dstStep, lenY)

    // Columns 1.. are processed in pairs; each staging buffer holds lenY interleaved pairs (2 * lenY floats).
    i = 1;
    b1 = b0 + 2 * lenY;
    b2 = b1 + 2 * lenY;
    b3 = b2 + 2 * lenY;

    if (numThreads > 1 && USE_THREADS(lenX, lenY)) {
        // Threaded path: the groups of 16 columns are split between the threads (threadLen groups each,
        // `tail` groups left over). fftinv_c_t_fun is defined outside this view.
        ils_info_t info;
        info.pSrc1 = pSrc;
        info.src1Step = srcStep;
        info.pDst = pDst;
        info.dstStep = dstStep;
        info.nThreads = numThreads;
        info.threadLen = ((lenX - 1) / 16) / numThreads;
        info.tail = ((lenX - 1) / 16) % numThreads;
        info.ctxDFT = ctxDFT;
        info.buf = buf;
        ippParallelFor_T(numThreads, (void *)&info, fftinv_c_t_fun);
        // Continue after the last full group of 16 columns; the loops below handle the remainder.
        i = ((lenX - 1) & ~15) + 1;
    } else if (BIG_IMAGE(lenX, lenY)) {
        // Serial path for large images: 16 columns (8 pairs) per iteration using eight staging buffers.
        b4 = b3 + 2 * lenY;
        b5 = b4 + 2 * lenY;
        b6 = b5 + 2 * lenY;
        b7 = b6 + 2 * lenY;
        for (; i < ((lenX - 1) & ~15) + 1; i += 16) {
            s = (Ipp32f *)pSrc + i;
            // NOTE(review): ils_cDftMerge_32fc / ils_cDftSplit_32fc are defined outside this view; since b0..b7
            // are transformed right after, the merge presumably fills all eight buffers (spaced
            // 2 * lenY * sizeof(Ipp32f) bytes apart) and the split writes them back - confirm.
            ils_cDftMerge_32fc((Ipp32fc *)s, srcStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
            // The 1D DFT work buffer starts right after the eight staging buffers (b0 + 16 * lenY).
            ippsDFTInv_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b4, (Ipp32fc *)b4, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b5, (Ipp32fc *)b5, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b6, (Ipp32fc *)b6, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            ippsDFTInv_CToC_32fc((Ipp32fc *)b7, (Ipp32fc *)b7, ctx_C_Y, (Ipp8u *)(b0 + 16 * lenY));
            d = pDst + i;
            ils_cDftSplit_32fc((Ipp32fc *)d, dstStep, (Ipp32fc *)b0, 2 * lenY * sizeof(Ipp32f), lenY);
        }
        i = ((lenX - 1) & ~15) + 1;
    }

    // Remaining columns, 8 at a time (4 pairs): gather into b0..b3, transform, scatter to pDst.
    for (; i < ((lenX - 1) & ~7) + 1; i += 8) {
        s = (Ipp32f *)pSrc + i;
        for (j = 0; j < lenY; j++) {
            b0[2 * j] = s[0];
            b0[2 * j + 1] = s[1];
            b1[2 * j] = s[2];
            b1[2 * j + 1] = s[3];
            b2[2 * j] = s[4];
            b2[2 * j + 1] = s[5];
            b3[2 * j] = s[6];
            b3[2 * j + 1] = s[7];
            s = (Ipp32f *)((char *)s + srcStep);
        }
        // Work buffer starts right after the four staging buffers (b0 + 8 * lenY).
        ippsDFTInv_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        ippsDFTInv_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        d = pDst + i;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[2 * j];
            d[1] = b0[2 * j + 1];
            d[2] = b1[2 * j];
            d[3] = b1[2 * j + 1];
            d[4] = b2[2 * j];
            d[5] = b2[2 * j + 1];
            d[6] = b3[2 * j];
            d[7] = b3[2 * j + 1];
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }
    // Leftover column pairs, one pair at a time. The start index is recomputed from lenX, so this loop does
    // not depend on where the previous loop stopped.
    for (i = ((lenX - 1) & ~7) + 1; i < lenX - 1; i += 2) {
        s = (Ipp32f *)pSrc + i;
        for (j = 0; j < lenY; j++) {
            b0[2 * j] = s[0];
            b0[2 * j + 1] = s[1];
            s = (Ipp32f *)((char *)s + srcStep);
        }
        ippsDFTInv_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 2 * lenY));
        d = pDst + i;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[2 * j];
            d[1] = b0[2 * j + 1];
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }

    // For even lenX the last column has no partner and is transformed as a packed real spectrum, like column 0.
    if (!(lenX & 1)) {
        s = (Ipp32f *)pSrc + lenX - 1;
        for (j = 0; j < lenY; j++) {
            b0[j] = s[0];
            s = (Ipp32f *)((char *)s + srcStep);
        }
        ippsDFTInv_PackToR_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
        d = pDst + lenX - 1;
        for (j = 0; j < lenY; j++) {
            d[0] = b0[j];
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }

    // ---- Pass 2: inverse transforms along rows, in place in pDst ----
    if (numThreads > 1 && USE_THREADS(lenX, lenY)) {
        // Threaded path: the lenY rows are split between the threads (threadLen rows each, `tail` rows left
        // over). fftinv_t_fun is defined outside this view.
        ils_info_t info;
        info.pSrc1 = pSrc;
        info.src1Step = srcStep;
        info.pDst = pDst;
        info.dstStep = dstStep;
        info.nThreads = numThreads;
        info.threadLen = lenY / numThreads;
        info.tail = lenY % numThreads;
        info.ctxDFT = ctxDFT;
        info.buf = buf;
        ippParallelFor_T(numThreads, (void *)&info, fftinv_t_fun);
    } else {
        d = pDst;
        for (j = 0; j < lenY; j++) {
            ippsDFTInv_PackToR_32f(d, d, ctx_R_X, buf);
            d = (Ipp32f *)((char *)d + dstStep);
        }
    }
    return (ippStsNoErr);
}
/*
 * Reports the byte sizes of the three memory areas used by the ILS filter for a given ROI.
 *
 *   filter       - ippiFilterILS_Norm or ippiFilterILS_Welsch; anything else -> ippStsBadArgErr.
 *   roiSize      - image size; both dimensions must be positive, otherwise ippStsSizeErr.
 *   dataType     - only ipp8u is accepted, otherwise ippStsDataTypeErr.
 *   numChannels  - 1 or 3, otherwise ippStsNumChannelsErr.
 *   pSpecSize    - [out] size of the spec: IppiFilterILSSpec + DFT context + one 32f plane (pDenorminInv).
 *   pInitSize    - [out] size of the scratch buffer consumed by ippiFilterILSInit_T.
 *   pBufSize     - [out] size of the work buffer consumed by the filtering functions.
 *
 * Returns ippStsOk on success and ippStsNullPtrErr when an output pointer is null.
 * ippStsSizeErr is also returned when the ROI is so large that any of the three sizes would not fit
 * in an int; previously such inputs overflowed signed arithmetic and produced meaningless sizes.
 * The output values are written only on success.
 */
IPPFUN(IppStatus, ippiFilterILSGetBufferSize_T,
       (IppiFilterILSType filter, IppiSize roiSize, IppDataType dataType, int numChannels, int *pSpecSize, int *pInitSize, int *pBufSize))
{
    if (!((filter == ippiFilterILS_Norm) || (filter == ippiFilterILS_Welsch)))
        return ippStsBadArgErr;
    if (roiSize.width <= 0 || roiSize.height <= 0)
        return ippStsSizeErr;
    if (dataType != ipp8u)
        return ippStsDataTypeErr;
    if (!(numChannels == 1 || numChannels == 3))
        return ippStsNumChannelsErr;
    if (pSpecSize == 0 || pInitSize == 0 || pBufSize == 0)
        return ippStsNullPtrErr;

    int dw = roiSize.width;
    int dh = roiSize.height;

    // Reject widths whose padded 32f row pitch would not fit in an int.
    // (Assumes ALGN64 rounds its argument up to a multiple of 64, i.e. adds at most 63.)
    if (dw > (IPP_MAX_32S - 63) / (int)sizeof(Ipp32f))
        return ippStsSizeErr;

    int su = ALGN64(dw * sizeof(Ipp8u));  // row pitch of an internal 8u plane, bytes
    int st = ALGN64(dw * sizeof(Ipp32f)); // row pitch of an internal 32f plane, bytes

    // Plane sizes are formed in 64-bit so the products cannot overflow. The 32f plane is the larger one
    // (st >= su), so bounding it also bounds the 8u plane and keeps the sums below within 64-bit range.
    Ipp64s plane8u = (Ipp64s)dh * su;
    Ipp64s plane32f = (Ipp64s)dh * st;
    if (plane32f > IPP_MAX_32S)
        return ippStsSizeErr;

    int flag = IPP_FFT_DIV_INV_BY_N;
    int dftSizeSpec = 0, dftSizeInit = 0, dftSizeBuf = 0;
    IppHintAlgorithm hint = ippAlgHintNone;

    ils_t_DFTGetSize_R_32f(roiSize, flag, hint, &dftSizeSpec, &dftSizeInit, &dftSizeBuf);

    Ipp64s specSize = (Ipp64s)sizeof(IppiFilterILSSpec) + dftSizeSpec + plane32f; // pDenorminInv

    Ipp64s initSize = plane32f +     // pPsf
                      plane32f +     // pPsfDft
                      plane32f +     // pPsfDftA
                      2 * plane32f + // pPsfDftE
                      (Ipp64s)dftSizeInit + dftSizeBuf;

    Ipp64s bufSize = plane8u +  // pU_8u
                     plane32f + // pU
                     plane32f + // pDftNrm1
                     plane32f + // pD
                     plane32f + // pMU
                     plane32f + // pN2H
                     plane32f + // pN2HV
                     dftSizeBuf;

    // The API reports sizes as int; refuse totals that cannot be represented.
    if (specSize > IPP_MAX_32S || initSize > IPP_MAX_32S || bufSize > IPP_MAX_32S)
        return ippStsSizeErr;

    *pBufSize = (int)bufSize;
    *pInitSize = (int)initSize;
    *pSpecSize = (int)specSize;

    return ippStsOk;
}
/*
 * Fills an ILS filter spec for the given ROI.
 *
 *   filter       - ippiFilterILS_Norm or ippiFilterILS_Welsch; anything else -> ippStsBadArgErr.
 *   roiSize      - image size; both dimensions must be positive, otherwise ippStsSizeErr.
 *   dataType     - only ipp8u is accepted, otherwise ippStsDataTypeErr.
 *   numChannels  - 1 or 3, otherwise ippStsNumChannelsErr.
 *   lambda       - stored in the spec and used to scale the denominator.
 *   eps, p       - used by the Norm filter only (stored as 0 for Welsch).
 *   gamma        - used by the Welsch filter only; for Norm the stored gamma is 0.5 * p - 1.
 *   pSpec        - [out] spec memory; the DFT context and the pDenorminInv plane are placed right after
 *                  the IppiFilterILSSpec structure.
 *   pBuf         - scratch memory, needed only during this call.
 *
 * Returns ippStsOk on success, ippStsNullPtrErr when pSpec or pBuf is null.
 */
IPPFUN(IppStatus, ippiFilterILSInit_T,
       (IppiFilterILSType filter, IppiSize roiSize, IppDataType dataType, int numChannels, Ipp64f lambda, Ipp64f eps, Ipp64f p, Ipp64f gamma,
        IppiFilterILSSpec *pSpec, Ipp8u *pBuf))
{
    if (!((filter == ippiFilterILS_Norm) || (filter == ippiFilterILS_Welsch))) {
        return ippStsBadArgErr;
    }
    if (roiSize.width <= 0 || roiSize.height <= 0) {
        return ippStsSizeErr;
    }
    if (dataType != ipp8u) {
        return ippStsDataTypeErr;
    }
    if (!(numChannels == 1 || numChannels == 3)) {
        return ippStsNumChannelsErr;
    }
    if (pSpec == 0 || pBuf == 0) {
        return ippStsNullPtrErr;
    }

    int cols = roiSize.width;
    int rows = roiSize.height;
    int dftFlag = IPP_FFT_DIV_INV_BY_N;
    int st = ALGN64(cols * sizeof(Ipp32f)); /* row pitch of every 32f plane, bytes */
    int planeBytes = rows * st;
    IppHintAlgorithm dftHint = ippAlgHintNone;

    /* ---- Scalar filter parameters ---- */
    pSpec->filter = filter;
    pSpec->lambda = lambda;
    if (filter == ippiFilterILS_Norm) {
        Ipp32f epsF = eps;
        Ipp32f epsPow; /* eps ^ gamma */
        pSpec->eps = eps;
        pSpec->p = p;
        pSpec->gamma = 0.5 * p - 1;
        ippsPowx_32f_A21(&epsF, (Ipp32f)pSpec->gamma, &epsPow, 1);
        pSpec->c = p * epsPow;
    } else {
        pSpec->gamma = gamma;
        pSpec->c = 2.0;
        pSpec->p = 0.0f;
        pSpec->eps = 0.0f;
    }

    /* ---- Carve the scratch buffer: pPsf | pPsfDft | pPsfDftE (double pitch) | pPsfDftA | DFT init | DFT work ---- */
    Ipp8u *cursor = pBuf;
    Ipp32f *pPsf = (Ipp32f *)cursor;
    cursor += planeBytes;
    Ipp32f *pPsfDft = (Ipp32f *)cursor;
    cursor += planeBytes;
    Ipp32fc *pPsfDftE = (Ipp32fc *)cursor;
    cursor += 2 * planeBytes;
    Ipp32f *pPsfDftA = (Ipp32f *)cursor;
    cursor += planeBytes;
    Ipp32f *pDenormin = pPsf; /* the denominator reuses the PSF plane */

    int dftSizeSpec, dftSizeInit, dftSizeBuf;
    ils_t_DFTGetSize_R_32f(roiSize, dftFlag, dftHint, &dftSizeSpec, &dftSizeInit, &dftSizeBuf);

    Ipp8u *pMemInit = cursor;
    cursor += dftSizeInit;
    Ipp8u *pBufferDft = cursor;

    /* ---- Lay out the trailing part of the spec and initialize the DFT context ---- */
    pSpec->pDFTSpec = (IlsDFTContext_32f *)((Ipp8u *)pSpec + sizeof(IppiFilterILSSpec));
    pSpec->pDenorminInv = (Ipp32f *)((Ipp8u *)pSpec->pDFTSpec + dftSizeSpec);
    ils_t_DFTInit_R_32f(roiSize, dftFlag, dftHint, pSpec->pDFTSpec, pMemInit);

    /* ---- First kernel: -1 at (0,0), +1 at the end of row 0; pPsfDftA = |DFT|^2 ---- */
    ippiSet_32f_C1R(0.0, pPsf, st, roiSize);
    pPsf[0] = -1;
    pPsf[cols - 1] = 1;
    ils_DFTFwd_RToPack_32f_C1R(pPsf, st, pPsfDft, st, pSpec->pDFTSpec, pBufferDft, 1);
    ippiPackToCplxExtend_32f32fc_C1R(pPsfDft, roiSize, st, pPsfDftE, 2 * st);
    ippiMagnitude_32fc32f_C1R(pPsfDftE, 2 * st, pPsfDftA, st, roiSize);
    ippiSqr_32f_C1IR(pPsfDftA, st, roiSize);

    /* ---- Second kernel: move the +1 to the first element of the last row; pDenormin = |DFT|^2 ---- */
    pPsf[cols - 1] = 0.0f;
    Ipp32f *lastRow = (Ipp32f *)((Ipp8u *)pPsf + (rows - 1) * st);
    lastRow[0] = 1.0f;
    ils_DFTFwd_RToPack_32f_C1R(pPsf, st, pPsfDft, st, pSpec->pDFTSpec, pBufferDft, 1);

    ippiPackToCplxExtend_32f32fc_C1R(pPsfDft, roiSize, st, pPsfDftE, 2 * st);
    ippiMagnitude_32fc32f_C1R(pPsfDftE, 2 * st, pDenormin, st, roiSize);
    ippiSqr_32f_C1IR(pDenormin, st, roiSize);

    /* ---- Denormin = 1 + 0.5 * c * lambda * (sum of both squared magnitudes) ---- */
    ippiAdd_32f_C1IR(pPsfDftA, st, pDenormin, st, roiSize);
    ippiMulC_32f_C1IR(0.5 * pSpec->c * lambda, pDenormin, st, roiSize);
    ippiAddC_32f_C1IR(1.0, pDenormin, st, roiSize);

    /* ---- Store 1 / Denormin in pack format: take the reciprocal row by row, widen to complex, repack ---- */
    {
        Ipp32f *rowScratch = (Ipp32f *)pPsfDftA; /* first row of pPsfDftA doubles as a one-row temporary */
        Ipp8u *realRow = (Ipp8u *)pDenormin;
        Ipp8u *cplxRow = (Ipp8u *)pPsfDftE;
        int row;
        for (row = 0; row < roiSize.height; row++) {
            ippsDivCRev_32f((Ipp32f *)realRow, 1.0, rowScratch, roiSize.width);
            ippsRealToCplx_32f(rowScratch, NULL, (Ipp32fc *)cplxRow, roiSize.width);
            realRow += st;
            cplxRow += 2 * st;
        }
        ippiCplxExtendToPack_32fc32f_C1R(pPsfDftE, 2 * st, roiSize, pSpec->pDenorminInv, st);
    }
    return ippStsOk;
}
/*
 * Common worker behind ippiFilterILS_8u_C1R_T and ippiFilterILS_8u_C3R_T.
 *
 *   pSrc, srcStep - source 8u image (interleaved when nChannels == 3).
 *   pDst, dstStep - destination 8u image with the same layout.
 *   roi           - image size; both dimensions must be positive, otherwise ippStsSizeErr.
 *   iter          - number of smoothing iterations; negative -> ippStsBadArgErr.
 *   nChannels     - 3 selects the per-channel deinterleave/interleave path, any other value is handled
 *                   as a single plane.
 *   pSpec         - spec providing c, p, gamma, eps, lambda, filter, pDFTSpec and pDenorminInv.
 *   pBuf          - work buffer, carved into one 8u plane, six 32f planes and the DFT work area.
 *
 * Each channel is scaled to 32f by 1/255, iterated `iter` times in the frequency domain, scaled back
 * by 255 and converted to 8u. The thread count reported by ippGetNumThreads_T selects between the
 * ils_s_* / ippi* calls and the ils_t_* calls.
 *
 * Returns ippStsOk, or ippStsNullPtrErr / ippStsStepErr (negative step) for invalid arguments.
 */
IppStatus owniFilterILS_8u_T(const Ipp8u *pSrc, int srcStep, Ipp8u *pDst, int dstStep, IppiSize roi, int iter, int nChannels,
                             IppiFilterILSSpec *pSpec, Ipp8u *pBuf)
{
    if (pSrc == 0 || pDst == 0 || pSpec == 0 || pBuf == 0) {
        return ippStsNullPtrErr;
    }
    if (srcStep < 0 || dstStep < 0) {
        return ippStsStepErr;
    }
    if (roi.width <= 0 || roi.height <= 0) {
        return ippStsSizeErr;
    }
    if (iter < 0) {
        return ippStsBadArgErr;
    }

    int cols = roi.width;
    int rows = roi.height;
    int su = ALGN64(cols * sizeof(Ipp8u));  /* row pitch of the internal 8u plane, bytes */
    int st = ALGN64(cols * sizeof(Ipp32f)); /* row pitch of the internal 32f planes, bytes */
    int plane8u = rows * su;
    int plane32f = rows * st;

    /* Filter coefficients taken from the spec. */
    Ipp64f C = pSpec->c;
    Ipp64f P = pSpec->p;
    Ipp64f G = pSpec->gamma;
    Ipp64f E = pSpec->eps;
    Ipp64f L = 0.5 * pSpec->lambda;
    IppiFilterILSType filter = pSpec->filter;
    IlsDFTContext_32f *pDFTSpec = pSpec->pDFTSpec;

    /* Work buffer layout: pU_8u | pU | pDftNrm1 | pD | pMU | pN2H | pN2HV | DFT work area. */
    Ipp8u *cursor = pBuf;
    Ipp8u *pU_8u = cursor;
    cursor += plane8u;
    Ipp32f *pU = (Ipp32f *)cursor;
    cursor += plane32f;
    Ipp32f *pDftNrm1 = (Ipp32f *)cursor;
    cursor += plane32f;
    Ipp32f *pD = (Ipp32f *)cursor;
    cursor += plane32f;
    Ipp32f *pMU = (Ipp32f *)cursor;
    cursor += plane32f;
    Ipp32f *pN2H = (Ipp32f *)cursor;
    cursor += plane32f;
    Ipp32f *pN2HV = (Ipp32f *)cursor;
    cursor += plane32f;
    Ipp8u *pBufferDft = cursor;

    /* Planes that are free at the point of use are recycled under a second name. */
    Ipp32f *pDftHV = pD;
    Ipp32f *pFU = pN2H;

#define dftfwd ils_DFTFwd_RToPack_32f_C1R
#define dftinv ils_DFTInv_PackToR_32f_C1R

    int numThreads = 1;
    ippGetNumThreads_T(&numThreads);

    const int isInterleaved = (nChannels == 3);
    const int isSingleThread = (numThreads == 1);

    for (int chan = 0; chan < nChannels; chan++) {
        /* Load: for 3-channel data extract the current channel into pU_8u first, then scale to 32f. */
        const Ipp8u *pIn = pSrc;
        int inStep = srcStep;
        if (isInterleaved) {
            ippiCopy_8u_C3C1R(pSrc + chan, srcStep, pU_8u, su, roi);
            pIn = pU_8u;
            inStep = su;
        }
        ippiScaleC_8u32f_C1R(pIn, inStep, 1.0 / 255.0, 0.0, pU, st, roi, ippAlgHintAccurate); /* F = single(F) */

        dftfwd(pU, st, pDftNrm1, st, pDFTSpec, pBufferDft, numThreads); /* Normin1 = fft2(U); */

        for (int k = 0; k < iter; k++) {
            if (isSingleThread) {
                /* u_h = [diff(U,1,2), U(:,1,:) - U(:,end,:)] */
                ils_s_DiffHR_C1(pU, st, pD, st, roi);
                /* mu_h = c.*u_h - p.*u_h .* (u_h.*u_h + eps) .^ gamma; */
                ils_s_MU_C1(pD, st, C, P, G, E, filter, pMU, st, roi);
                /* Normin2_h = [mu_h(:,end,:) - mu_h(:, 1,:), - diff(mu_h,1,2)]; */
                ils_s_DiffHL_C1(pMU, st, pN2H, st, roi);

                /* u_v = [diff(U,1,1); U(1,:,:) - U(end,:,:)]; */
                ils_s_DiffVU_C1(pU, st, pD, st, roi);
                /* mu_v = c .* u_v - p .* u_v .* (u_v .* u_v + eps) .^ gamma; */
                ils_s_MU_C1(pD, st, C, P, G, E, filter, pMU, st, roi);
                /* Normin2_v = [mu_v(end,:,:) - mu_v(1, :,:); - diff(mu_v,1,1)]; */
                ils_s_DiffVD_C1(pMU, st, pN2HV, st, roi);

                /* Normin2_h + Normin2_v */
                ippiAdd_32f_C1IR(pN2H, st, pN2HV, st, roi);
                /* fft2(Normin2_h + Normin2_v) */
                dftfwd(pN2HV, st, pDftHV, st, pDFTSpec, pBufferDft, numThreads);
                /* 0.5 * lambda * (fft2(Normin2_h + Normin2_v)) */
                ippiMulC_32f_C1IR(L, pDftHV, st, roi);
                /* (Normin1 + 0.5 * lambda * (fft2(Normin2_h + Normin2_v))) */
                ippiAdd_32f_C1IR(pDftHV, st, pDftNrm1, st, roi);
                /* FU = (...) .* pDenorminInv */
                ils_s_MulPack_32f_C1R(pSpec->pDenorminInv, st, pDftNrm1, st, pFU, st, roi);
                /* U = real(ifft2(FU)); */
                dftinv(pFU, st, pU, st, pDFTSpec, pBufferDft, 1);
                /* FU is carried into the next iteration in pDftNrm1. */
                ippiCopy_32f_C1R(pFU, st, pDftNrm1, st, roi);
            } else {
                /* Same sequence through the threaded helpers; here the product is written back into
                 * pDftNrm1 directly, so no copy is needed. */
                ils_t_DiffHR_C1(pU, st, pD, st, roi, numThreads);
                ils_t_MU_C1(pD, st, C, P, G, E, filter, pMU, st, roi, numThreads);
                ils_t_DiffHL_C1(pMU, st, pN2H, st, roi, numThreads);

                ils_t_DiffVU_C1(pU, st, pD, st, roi, numThreads);
                ils_t_MU_C1(pD, st, C, P, G, E, filter, pMU, st, roi, numThreads);
                ils_t_DiffVD_C1(pMU, st, pN2HV, st, roi, numThreads);

                ils_t_Add_C1I(pN2H, st, pN2HV, st, roi, numThreads);

                dftfwd(pN2HV, st, pDftHV, st, pDFTSpec, pBufferDft, numThreads);
                ils_t_MulC_C1I(L, pDftHV, st, roi, numThreads);
                ils_t_Add_C1I(pDftHV, st, pDftNrm1, st, roi, numThreads);
                ils_t_MulPack_32f_C1R(pSpec->pDenorminInv, st, pDftNrm1, st, pDftNrm1, st, roi, numThreads);
                dftinv(pDftNrm1, st, pU, st, pDFTSpec, pBufferDft, numThreads);
            }
        }

        /* Store: scale back to the 8u range, convert, and for 3-channel data re-interleave the channel. */
        ippiMulC_32f_C1IR(255.0, pU, st, roi);
        Ipp8u *pOut = pDst;
        int outStep = dstStep;
        if (isInterleaved) {
            pOut = pU_8u;
            outStep = su;
        }
        ippiConvert_32f8u_C1R(pU, st, pOut, outStep, roi, ippRndNear);
        if (isInterleaved) {
            ippiCopy_8u_C1C3R(pU_8u, su, pDst + chan, dstStep, roi);
        }
    }
    return ippStsOk;
}
/*
 * ILS filter for a single-channel 8u image: forwards all arguments to owniFilterILS_8u_T
 * with a channel count of 1 and returns its status unchanged.
 */
IPPFUN(IppStatus, ippiFilterILS_8u_C1R_T,
       (const Ipp8u *pSrc, int srcStep, Ipp8u *pDst, int dstStep, IppiSize roi, int iter, IppiFilterILSSpec *pSpec, Ipp8u *pBuf))
{
    const int channelCount = 1;
    IppStatus status = owniFilterILS_8u_T(pSrc, srcStep, pDst, dstStep, roi, iter, channelCount, pSpec, pBuf);
    return status;
}
/*
 * ILS filter for a three-channel 8u image: forwards all arguments to owniFilterILS_8u_T
 * with a channel count of 3 and returns its status unchanged.
 */
IPPFUN(IppStatus, ippiFilterILS_8u_C3R_T,
       (const Ipp8u *pSrc, int srcStep, Ipp8u *pDst, int dstStep, IppiSize roi, int iter, IppiFilterILSSpec *pSpec, Ipp8u *pBuf))
{
    const int channelCount = 3;
    IppStatus status = owniFilterILS_8u_T(pSrc, srcStep, pDst, dstStep, roi, iter, channelCount, pSpec, pBuf);
    return status;
}
