柚子快報激活碼778899分享:縮放算法優化步驟詳解
柚子快報激活碼778899分享:縮放算法優化步驟詳解
添加鏈接描述
背景
假設數據存放在 unsigned char* m_pData 里面,寬和高分別是 m_nDataWidth、m_nDataHeight。給定縮放比例 fXZoom、fYZoom,返回縮放后的 unsigned char* dataZoom。這里采用最簡單的縮放算法,即根據比例計算原圖和縮放后圖坐標的對應關係:原圖坐標 = 縮放后圖坐標 / 縮放比例(結果向下取整,并限制在原圖范圍之內)。
原始代碼 未優(yōu)化
#pragma once
/// Holds an 8-bit single-channel image (m_nDataWidth x m_nDataHeight bytes in m_pData)
/// and produces scaled copies of it.
class zoomBlock
{
public:
    zoomBlock() {}
    ~zoomBlock();

    // m_pData is an owning raw pointer that the destructor frees. A compiler-generated
    // copy would leave two objects deleting the same buffer, so copying is disabled.
    zoomBlock(const zoomBlock&) = delete;
    zoomBlock& operator=(const zoomBlock&) = delete;

    /// SIMD variant of zoomData(); same parameters.
    void zoomDataSSE128(unsigned char* dataZoom, float fXZoom, float fYZoom);

    /// Writes the scaled image into dataZoom.
    /// @param dataZoom caller-allocated output buffer
    /// @param fXZoom   horizontal zoom factor
    /// @param fYZoom   vertical zoom factor
    void zoomData(unsigned char* dataZoom, float fXZoom, float fYZoom);

    /// Self-test: builds a small image, scales it and prints both images.
    void test(float fXZoom = 0.5, float fYZoom = 0.5);

    /// Allocates the source image with the given dimensions.
    void init(int DataWidth, int DataHeight);

private:
    void computeSrcValues(int* srcValues, size_t size, float zoom, int dataSize);

private:
    unsigned char* m_pData = nullptr;  // owned pixel buffer
    float m_fXZoom = 1;                // X-axis zoom factor; 1 means no scaling
    float m_fYZoom = 1;                // Y-axis zoom factor
    int m_nDataWidth = 0;
    int m_nDataHeight = 0;
};
#include "zoomBlock.h"
#include <algorithm>
#include <iomanip>
#include <iostream>
// Frees an array allocated with new[] and resets the pointer to nullptr.
// The do { } while (0) wrapper makes "SAFE_DELETE_ARRAY(p);" exactly one statement,
// so the macro can be used in an unbraced if/else. delete[] on a null pointer is a
// no-op, so no explicit null check is needed.
#define SAFE_DELETE_ARRAY(p) do { delete[] (p); (p) = nullptr; } while (0)
// Releases the pixel buffer owned by this object.
zoomBlock::~zoomBlock()
{
    if (m_pData != nullptr)
    {
        delete[] m_pData;
    }
    m_pData = nullptr;
}
// Allocates a DataWidth x DataHeight source image and fills it with a ramp
// (pixel i holds i converted to unsigned char, so values wrap modulo 256).
// Calling init() again replaces the previous image.
void zoomBlock::init(int DataWidth, int DataHeight)
{
    // Drop any buffer from an earlier init() so re-initialising does not leak it.
    delete[] m_pData;
    m_pData = nullptr;
    m_nDataWidth = 0;
    m_nDataHeight = 0;

    if (DataWidth <= 0 || DataHeight <= 0)
    {
        return;  // nothing to allocate; the object stays empty
    }

    m_nDataWidth = DataWidth;
    m_nDataHeight = DataHeight;
    const int pixelCount = m_nDataWidth * m_nDataHeight;
    m_pData = new unsigned char[pixelCount];
    for (int i = 0; i < pixelCount; ++i)
    {
        m_pData[i] = static_cast<unsigned char>(i);
    }
}
// Scalar scaling: every destination pixel copies the source pixel at
// (int(x / fXZoom), int(y / fYZoom)), clamped to the last source column/row.
//
// dataZoom must hold int(fXZoom * width) * int(fYZoom * height) bytes.
// Both images are row-major, so a row stride is always a WIDTH: m_nDataWidth for
// the source and nZoomDataWidth for the destination.
void zoomBlock::zoomData(unsigned char* dataZoom, float fXZoom, float fYZoom)
{
    if (dataZoom == nullptr || m_pData == nullptr || fXZoom <= 0.0f || fYZoom <= 0.0f)
    {
        return;
    }

    const int nZoomDataWidth = static_cast<int>(fXZoom * m_nDataWidth);
    const int nZoomDataHeight = static_cast<int>(fYZoom * m_nDataHeight);

    for (int y = 0; y < nZoomDataHeight; ++y)
    {
        // The source row depends only on y, so compute it once per destination row.
        const int srcRow = std::min(static_cast<int>(y / fYZoom), m_nDataHeight - 1);
        const unsigned char* srcLine = m_pData + static_cast<size_t>(srcRow) * m_nDataWidth;
        unsigned char* dstLine = dataZoom + static_cast<size_t>(y) * nZoomDataWidth;

        for (int x = 0; x < nZoomDataWidth; ++x)
        {
            const int srcCol = std::min(static_cast<int>(x / fXZoom), m_nDataWidth - 1);
            dstLine[x] = srcLine[srcCol];
        }
    }
}
void zoomBlock::test(float fXZoom, float fYZoom)
{
init(8,8);
std::cout << "Values in m_pData:" << std::endl;
for (int i = 0; i < m_nDataWidth * m_nDataHeight; ++i)
{
std::cout << std::setw(4) << static_cast
if ((i + 1) % m_nDataWidth == 0)
{ // Adjust the value based on your data
std::cout << std::endl;
}
}
unsigned char* dataZoom = new unsigned char[fXZoom * m_nDataWidth * fYZoom * m_nDataHeight];
zoomData(dataZoom, fXZoom, fYZoom);
// Print or inspect the values in m_dataZoom
int nZoomDataWidth = fXZoom * m_nDataWidth;
int nZoomDataHeight = fYZoom * m_nDataHeight;
std::cout << "Values in m_dataZoom:" << std::endl;
for (int i = 0; i < nZoomDataHeight * nZoomDataWidth; ++i)
{
std::cout << std::setw(4)<< static_cast
if ((i + 1) % nZoomDataWidth == 0) { // Adjust the value based on your data
std::cout << std::endl;
}
}
SAFE_DELETE_ARRAY(dataZoom);
}
測試代碼
int main()
{
zoomBlock zoomBlocktest;
zoomBlocktest.test(1.5,1.5);
return 0;
}
其中函數 `void zoomBlock::zoomData(unsigned char* dataZoom, float fXZoom, float fYZoom)` 沒有使用任何加速優化,現在來分析它。
sse128
我們知道 SSE 的 128 位寄存器可以一次性處理 4 個 int,所以我們把最內層的 for 循環改成每次處理 4 個坐標的算法,不滿 4 個的余下部分單獨計算。
// Scalar preparation step for the SSE version: the inner loop is restructured to
// handle four destination pixels per iteration (the work one 128-bit register of
// four ints will later do); leftover columns are handled one at a time.
//
// All four pixels of a group lie in the SAME destination row; only the column
// advances. Row strides are widths: m_nDataWidth for the source and nZoomDataWidth
// for the destination.
void zoomBlock::zoomDataSSE128(unsigned char* dataZoom, float fXZoom, float fYZoom)
{
    if (dataZoom == nullptr || m_pData == nullptr || fXZoom <= 0.0f || fYZoom <= 0.0f)
    {
        return;
    }

    const int nZoomDataWidth = static_cast<int>(fXZoom * m_nDataWidth);
    const int nZoomDataHeight = static_cast<int>(fYZoom * m_nDataHeight);
    const int remain = nZoomDataWidth % 4;
    const int groupedEnd = nZoomDataWidth - remain;  // first column not covered by a group of four

    for (int row = 0; row < nZoomDataHeight; ++row)
    {
        const int srcRow = std::min(static_cast<int>(row / fYZoom), m_nDataHeight - 1);
        const unsigned char* srcLine = m_pData + static_cast<size_t>(srcRow) * m_nDataWidth;
        unsigned char* dstLine = dataZoom + static_cast<size_t>(row) * nZoomDataWidth;

        for (int column = 0; column < groupedEnd; column += 4)
        {
            // Coordinates 1..4 of the group: columns column .. column + 3.
            for (int lane = 0; lane < 4; ++lane)
            {
                const int dstCol = column + lane;
                const int srcCol = std::min(static_cast<int>(dstCol / fXZoom), m_nDataWidth - 1);
                dstLine[dstCol] = srcLine[srcCol];
            }
        }

        // Remaining columns (fewer than four), processed one at a time.
        for (int column = groupedEnd; column < nZoomDataWidth; ++column)
        {
            const int srcCol = std::min(static_cast<int>(column / fXZoom), m_nDataWidth - 1);
            dstLine[column] = srcLine[srcCol];
        }
    }
}
上面一次處理四個坐標的代碼要改成 SSE 的代碼。
在最里層的循環里面,每次都要計算 row / fYZoom 和 column / fXZoom,這個實際上可以挪出 for 循環,計算一次存到數組里。
數據坐標 desPos 和 srcPos 的計算,必須放在最內層的循環里。
所以我們用 calculateSrcIndex 函數單獨處理 row / fYZoom 和 column / fXZoom,希望達到如下效果:
// Fills srcValues[0 .. size-1] with the source index for each destination index:
// srcValues[i] = int(i / zoom), capped at max.
void calculateSrcIndex(int* srcValues, int size, float zoom, int max)
{
    int dstIndex = 0;
    while (dstIndex < size)
    {
        const int scaled = int(dstIndex / zoom);
        srcValues[dstIndex] = (max < scaled) ? max : scaled;
        ++dstIndex;
    }
}
改成sse:
// SSE version of the index table: srcValues[i] = min(int(i / zoom), max) for
// i in [0, size), four entries per iteration.
//
// @param srcValues output array with room for `size` ints
// @param size      number of destination indices
// @param zoom      zoom factor; expected to be > 0
// @param max       largest valid source index (results are capped to it)
//
// The vector lanes produce exactly what the scalar tail produces:
//  * _mm_cvttps_epi32 truncates toward zero like int(x); _mm_cvtps_epi32 would
//    round to nearest and shift some indices by one;
//  * the lanes divide by zoom instead of multiplying by 1/zoom, because the
//    reciprocal is rounded and can land on the other side of an integer.
// Only SSE2 is used, so no SSE4.1 compiler flag or CPU support is required.
void calculateSrcIndex(int* srcValues, int size, float zoom, int max)
{
    if (srcValues == nullptr || size <= 0)
    {
        return;
    }

    const __m128 mmZoom = _mm_set1_ps(zoom);
    const __m128i mmMax = _mm_set1_epi32(max);
    const int vecEnd = size - size % 4;  // first index not covered by a full group of four

    for (int i = 0; i < vecEnd; i += 4)
    {
        const __m128i mmIndex = _mm_set_epi32(i + 3, i + 2, i + 1, i);
        const __m128 mmScaled = _mm_div_ps(_mm_cvtepi32_ps(mmIndex), mmZoom);
        __m128i mmSrcValue = _mm_cvttps_epi32(mmScaled);

        // SSE2 equivalent of _mm_min_epi32(mmSrcValue, mmMax): select max in the
        // lanes where the value exceeds it, keep the value elsewhere.
        const __m128i mmTooBig = _mm_cmpgt_epi32(mmSrcValue, mmMax);
        mmSrcValue = _mm_or_si128(_mm_and_si128(mmTooBig, mmMax),
                                  _mm_andnot_si128(mmTooBig, mmSrcValue));

        _mm_storeu_si128(reinterpret_cast<__m128i*>(srcValues + i), mmSrcValue);
    }

    // Remaining elements (fewer than four) without SSE.
    for (int i = vecEnd; i < size; ++i)
    {
        srcValues[i] = std::min(static_cast<int>(i / zoom), max);
    }
}
解釋: 這里主要處理 int 型數據,為了使用 SSE 加速,要使用 __m128i 類型來存儲 4 個 int
加載int到__m128i:
__m128i _mm_set1_epi32(int i); 這個指令是使用1個i,來設置__m128i,將__m128i看做4個32位的部分,則每個部分都被賦為i; __m128i _mm_set_epi32(int i3, int i2,int i1, int i0); 說明:使用4個int(32bits)變量來設置__m128i變量; 返回值:如果返回值__m128i,分為r0,r1,r2,r3,返回值規則如下:
r0 := i0 r1 := i1 r2 := i2 r3 := i3
__m128i _mm_cvtps_epi32 (__m128 a) Converts packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, rounding according to the current rounding mode (round-to-nearest by default). 注意:C++ 的 int(x) 是向零截斷,若要得到與標量代碼一致的結果,應使用 __m128i _mm_cvttps_epi32 (__m128 a)(convert with truncation)。
加載float到__m128
__m128 _mm_set1_ps(float w) 對應于 _mm_load1_ps 的功能,不需要字節對齊,需要多條指令。(r0 = r1 = r2 = r3 = w) __m128 _mm_cvtepi32_ps (__m128i a) Converts packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements.
float乘法
__m128 dst = _mm_mul_ps (__m128 a, __m128 b) 將 a、b 中的 32 位浮點數逐個相乘,結果打包給 dst
取最小值
__m128i _mm_min_epi32 (__m128i a, __m128i b) Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst. Operation FOR j := 0 to 3 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR
所以代碼修改為
// Pre-compute the lookup tables once, so the inner loop only does table look-ups:
//   srcX[column] = source column for a destination column (X axis, width, fXZoom)
//   srcY[row]    = source row for a destination row       (Y axis, height, fYZoom)
int* srcX = new int[nZoomDataWidth];
int* srcY = new int[nZoomDataHeight];
calculateSrcIndex(srcX, nZoomDataWidth, fXZoom, m_nDataWidth - 1);
calculateSrcIndex(srcY, nZoomDataHeight, fYZoom, m_nDataHeight - 1);
for (size_t row = 0; row < nZoomDataHeight; row++)
{
    int remian = nZoomDataWidth % 4;
    for (size_t column = 0; column < nZoomDataWidth - remian; column += 4)
    {
        // First coordinate. Both images are row-major, so the row stride is a width.
        int srcPos = srcY[row] * m_nDataWidth + srcX[column];
        int desPos = row * nZoomDataWidth + column;
        dataZoom[desPos] = m_pData[srcPos];
        // ... coordinates 2-4 (column + 1 .. column + 3) follow the same pattern
    }
}
然后把坐標的計算轉(zhuǎn)為sse
// SSE scaling: source indices come from lookup tables built once by
// calculateSrcIndex(); the inner loop computes four source and four destination
// offsets per iteration with 128-bit integer adds.
//
// dataZoom must hold int(fXZoom * width) * int(fYZoom * height) bytes.
// Row strides are widths (m_nDataWidth / nZoomDataWidth) in both the vector body
// and the remainder loop.
void zoomBlock::zoomDataSSE128(unsigned char* dataZoom, float fXZoom, float fYZoom)
{
    if (dataZoom == nullptr || m_pData == nullptr || fXZoom <= 0.0f || fYZoom <= 0.0f)
    {
        return;
    }

    const int nZoomDataWidth = static_cast<int>(fXZoom * m_nDataWidth);
    const int nZoomDataHeight = static_cast<int>(fYZoom * m_nDataHeight);
    if (nZoomDataWidth <= 0 || nZoomDataHeight <= 0)
    {
        return;
    }

    // One allocation for both tables: nothing can leak between two separate new[] calls.
    int* indexTable = new int[static_cast<size_t>(nZoomDataWidth) + nZoomDataHeight];
    int* srcX = indexTable;                   // destination column -> source column
    int* srcY = indexTable + nZoomDataWidth;  // destination row    -> source row
    calculateSrcIndex(srcX, nZoomDataWidth, fXZoom, m_nDataWidth - 1);
    calculateSrcIndex(srcY, nZoomDataHeight, fYZoom, m_nDataHeight - 1);

    const int vecEnd = nZoomDataWidth - nZoomDataWidth % 4;
    for (int y = 0; y < nZoomDataHeight; ++y)
    {
        const int srcRowBase = srcY[y] * m_nDataWidth;
        const int dstRowBase = y * nZoomDataWidth;
        const __m128i mmSrcRowBase = _mm_set1_epi32(srcRowBase);
        const __m128i mmDstRowBase = _mm_set1_epi32(dstRowBase);

        for (int x = 0; x < vecEnd; x += 4)
        {
            const __m128i mmSrcX = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcX + x));
            const __m128i srcPosIndices = _mm_add_epi32(mmSrcRowBase, mmSrcX);
            const __m128i desPosIndices = _mm_add_epi32(
                mmDstRowBase,
                _mm_set_epi32(x + 3, x + 2, x + 1, x));

            // Spill the lanes to plain arrays; reading them through a member such as
            // .m128i_i32 only compiles with MSVC.
            int srcPos[4];
            int desPos[4];
            _mm_storeu_si128(reinterpret_cast<__m128i*>(srcPos), srcPosIndices);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(desPos), desPosIndices);
            for (int lane = 0; lane < 4; ++lane)
            {
                dataZoom[desPos[lane]] = m_pData[srcPos[lane]];
            }
        }

        // Remaining columns (fewer than four) without SSE, using the same tables.
        for (int x = vecEnd; x < nZoomDataWidth; ++x)
        {
            dataZoom[dstRowBase + x] = m_pData[srcRowBase + srcX[x]];
        }
    }

    delete[] indexTable;
}
完整的代碼
#pragma once
/// Holds an 8-bit single-channel image (m_nDataWidth x m_nDataHeight bytes in m_pData)
/// and produces scaled copies of it with scalar, SSE and AVX2 code paths.
class zoomBlock
{
public:
    zoomBlock() {}
    ~zoomBlock();

    // m_pData is an owning raw pointer that the destructor frees. A compiler-generated
    // copy would leave two objects deleting the same buffer, so copying is disabled.
    zoomBlock(const zoomBlock&) = delete;
    zoomBlock& operator=(const zoomBlock&) = delete;

    /// 128-bit SIMD variant of zoomData(); same parameters.
    void zoomDataSSE128(unsigned char* dataZoom, float fXZoom, float fYZoom);

    /// 256-bit SIMD variant of zoomData(); same parameters.
    void zoomDataAVX2(unsigned char* dataZoom, float fXZoom, float fYZoom);

    /// Writes the scaled image into dataZoom.
    /// @param dataZoom caller-allocated output buffer
    /// @param fXZoom   horizontal zoom factor
    /// @param fYZoom   vertical zoom factor
    void zoomData(unsigned char* dataZoom, float fXZoom, float fYZoom);

    /// Self-test: builds a small image, scales it and prints both images.
    void test(float fXZoom = 0.5, float fYZoom = 0.5);

    /// Allocates the source image with the given dimensions.
    void init(int DataWidth, int DataHeight);

private:
    /// Fills srcValues[i] = min(int(i / zoom), max) for i in [0, size), four at a time.
    inline void calculateSrcIndex(int* srcValues, int size, float zoom, int max);

    /// Same table as calculateSrcIndex(), eight entries at a time.
    inline void calculateSrcIndex256(int* srcValues, int size, float zoom, int max);

private:
    unsigned char* m_pData = nullptr;  // owned pixel buffer
    float m_fXZoom = 1;                // X-axis zoom factor; 1 means no scaling
    float m_fYZoom = 1;                // Y-axis zoom factor
    int m_nDataWidth = 0;
    int m_nDataHeight = 0;
};
#include "zoomBlock.h"
#include <immintrin.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
using namespace std;
// Frees an array allocated with new[] and resets the pointer to nullptr.
// The do { } while (0) wrapper makes "SAFE_DELETE_ARRAY(p);" exactly one statement,
// so the macro can be used in an unbraced if/else. delete[] on a null pointer is a
// no-op, so no explicit null check is needed.
#define SAFE_DELETE_ARRAY(p) do { delete[] (p); (p) = nullptr; } while (0)
// Releases the pixel buffer owned by this object.
zoomBlock::~zoomBlock()
{
    if (m_pData != nullptr)
    {
        delete[] m_pData;
    }
    m_pData = nullptr;
}
// Allocates a DataWidth x DataHeight source image and fills it with a ramp
// (pixel i holds i converted to unsigned char, so values wrap modulo 256).
// Calling init() again replaces the previous image.
void zoomBlock::init(int DataWidth, int DataHeight)
{
    // Drop any buffer from an earlier init() so re-initialising does not leak it.
    delete[] m_pData;
    m_pData = nullptr;
    m_nDataWidth = 0;
    m_nDataHeight = 0;

    if (DataWidth <= 0 || DataHeight <= 0)
    {
        return;  // nothing to allocate; the object stays empty
    }

    m_nDataWidth = DataWidth;
    m_nDataHeight = DataHeight;
    const int pixelCount = m_nDataWidth * m_nDataHeight;
    m_pData = new unsigned char[pixelCount];
    for (int i = 0; i < pixelCount; ++i)
    {
        m_pData[i] = static_cast<unsigned char>(i);
    }
}
// Scalar reference implementation: every destination pixel copies the source pixel
// at (int(x / fXZoom), int(y / fYZoom)), clamped to the last source column/row.
//
// dataZoom must hold int(fXZoom * width) * int(fYZoom * height) bytes.
// Loop counters are signed: with unsigned counters a negative computed dimension
// converts to a huge bound and the loops run far out of bounds.
void zoomBlock::zoomData(unsigned char* dataZoom, float fXZoom, float fYZoom)
{
    if (dataZoom == nullptr || m_pData == nullptr || fXZoom <= 0.0f || fYZoom <= 0.0f)
    {
        return;
    }

    const int nZoomDataWidth = static_cast<int>(fXZoom * m_nDataWidth);
    const int nZoomDataHeight = static_cast<int>(fYZoom * m_nDataHeight);

    for (int y = 0; y < nZoomDataHeight; ++y)
    {
        // The source row depends only on y, so compute it once per destination row.
        const int srcy = std::min(static_cast<int>(y / fYZoom), m_nDataHeight - 1);
        const unsigned char* srcLine = m_pData + static_cast<size_t>(srcy) * m_nDataWidth;
        unsigned char* dstLine = dataZoom + static_cast<size_t>(y) * nZoomDataWidth;

        for (int x = 0; x < nZoomDataWidth; ++x)
        {
            const int srcx = std::min(static_cast<int>(x / fXZoom), m_nDataWidth - 1);
            dstLine[x] = srcLine[srcx];
        }
    }
}
// Builds the destination-index -> source-index table with SSE:
// srcValues[i] = min(int(i / zoom), max) for i in [0, size), four entries per iteration.
//
// @param srcValues output array with room for `size` ints
// @param size      number of destination indices
// @param zoom      zoom factor; expected to be > 0
// @param max       largest valid source index (results are capped to it)
//
// The lanes divide by zoom (rather than multiplying by a rounded 1/zoom) and
// truncate with _mm_cvttps_epi32, so they match the scalar tail exactly.
// Counters are signed so a non-positive size cannot wrap into a huge loop bound.
// Only SSE2 is used, so no SSE4.1 compiler flag or CPU support is required.
inline void zoomBlock::calculateSrcIndex(int* srcValues, int size, float zoom, int max)
{
    if (srcValues == nullptr || size <= 0)
    {
        return;
    }

    const __m128 mmZoom = _mm_set1_ps(zoom);
    const __m128i mmMax = _mm_set1_epi32(max);
    const int vecEnd = size - size % 4;  // first index not covered by a full group of four

    for (int i = 0; i < vecEnd; i += 4)
    {
        const __m128i mmIndex = _mm_set_epi32(i + 3, i + 2, i + 1, i);
        const __m128 mmScaled = _mm_div_ps(_mm_cvtepi32_ps(mmIndex), mmZoom);
        __m128i mmSrcValue = _mm_cvttps_epi32(mmScaled);

        // SSE2 equivalent of _mm_min_epi32(mmSrcValue, mmMax): select max in the
        // lanes where the value exceeds it, keep the value elsewhere.
        const __m128i mmTooBig = _mm_cmpgt_epi32(mmSrcValue, mmMax);
        mmSrcValue = _mm_or_si128(_mm_and_si128(mmTooBig, mmMax),
                                  _mm_andnot_si128(mmTooBig, mmSrcValue));

        _mm_storeu_si128(reinterpret_cast<__m128i*>(srcValues + i), mmSrcValue);
    }

    // Remaining elements (fewer than four) without SSE.
    for (int i = vecEnd; i < size; ++i)
    {
        srcValues[i] = std::min(static_cast<int>(i / zoom), max);
    }
}
// SSE scaling: source indices come from lookup tables built once by
// calculateSrcIndex(); the inner loop computes four source and four destination
// offsets per iteration with 128-bit integer adds.
//
// dataZoom must hold int(fXZoom * width) * int(fYZoom * height) bytes.
// Row strides are widths (m_nDataWidth / nZoomDataWidth) in both the vector body
// and the remainder loop, so non-square images whose scaled width is not a
// multiple of four are handled correctly.
void zoomBlock::zoomDataSSE128(unsigned char* dataZoom, float fXZoom, float fYZoom)
{
    if (dataZoom == nullptr || m_pData == nullptr || fXZoom <= 0.0f || fYZoom <= 0.0f)
    {
        return;
    }

    const int nZoomDataWidth = static_cast<int>(fXZoom * m_nDataWidth);
    const int nZoomDataHeight = static_cast<int>(fYZoom * m_nDataHeight);
    if (nZoomDataWidth <= 0 || nZoomDataHeight <= 0)
    {
        return;
    }

    // One allocation for both tables: nothing can leak between two separate new[] calls.
    int* indexTable = new int[static_cast<size_t>(nZoomDataWidth) + nZoomDataHeight];
    int* srcX = indexTable;                   // destination column -> source column
    int* srcY = indexTable + nZoomDataWidth;  // destination row    -> source row
    calculateSrcIndex(srcX, nZoomDataWidth, fXZoom, m_nDataWidth - 1);
    calculateSrcIndex(srcY, nZoomDataHeight, fYZoom, m_nDataHeight - 1);

    const int vecEnd = nZoomDataWidth - nZoomDataWidth % 4;
    for (int y = 0; y < nZoomDataHeight; ++y)
    {
        const int srcRowBase = srcY[y] * m_nDataWidth;
        const int dstRowBase = y * nZoomDataWidth;
        const __m128i mmSrcRowBase = _mm_set1_epi32(srcRowBase);
        const __m128i mmDstRowBase = _mm_set1_epi32(dstRowBase);

        for (int x = 0; x < vecEnd; x += 4)
        {
            const __m128i mmSrcX = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcX + x));
            const __m128i srcPosIndices = _mm_add_epi32(mmSrcRowBase, mmSrcX);
            const __m128i desPosIndices = _mm_add_epi32(
                mmDstRowBase,
                _mm_set_epi32(x + 3, x + 2, x + 1, x));

            // Spill the lanes to plain arrays; reading them through a member such as
            // .m128i_i32 only compiles with MSVC.
            int srcPos[4];
            int desPos[4];
            _mm_storeu_si128(reinterpret_cast<__m128i*>(srcPos), srcPosIndices);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(desPos), desPosIndices);
            for (int lane = 0; lane < 4; ++lane)
            {
                dataZoom[desPos[lane]] = m_pData[srcPos[lane]];
            }
        }

        // Remaining columns (fewer than four) without SSE, using the same tables.
        for (int x = vecEnd; x < nZoomDataWidth; ++x)
        {
            dataZoom[dstRowBase + x] = m_pData[srcRowBase + srcX[x]];
        }
    }

    delete[] indexTable;
}
void zoomBlock::test(float fXZoom, float fYZoom)
{
init(8,4);
std::cout << "Values in m_pData:" << std::endl;
for (int i = 0; i < m_nDataWidth * m_nDataHeight; ++i)
{
std::cout << std::setw(4) << static_cast
if ((i + 1) % m_nDataWidth == 0)
{ // Adjust the value based on your data
std::cout << std::endl;
}
}
int nZoomDataWidth = fXZoom * m_nDataWidth;
int nZoomDataHeight = fYZoom * m_nDataHeight;
unsigned char* dataZoom = new unsigned char[nZoomDataWidth * nZoomDataHeight];
zoomDataSSE128(dataZoom, fXZoom, fYZoom);
//zoomData(dataZoom, fXZoom, fYZoom);
// Print or inspect the values in m_dataZoom
std::cout << "Values in m_dataZoom:" << std::endl;
for (int i = 0; i < nZoomDataHeight * nZoomDataWidth; ++i)
{
std::cout << std::setw(4)<< static_cast
if ((i + 1) % nZoomDataWidth == 0) { // Adjust the value based on your data
std::cout << std::endl;
}
}
SAFE_DELETE_ARRAY(dataZoom);
}
int main()
{
zoomBlock zoomBlocktest;
zoomBlocktest.test(2,1);
return 0;
}
AVX 256
// AVX2 version of the index table: srcValues[i] = min(int(i / zoom), max) for
// i in [0, size), eight entries per iteration.
//
// @param srcValues output array with room for `size` ints
// @param size      number of destination indices
// @param zoom      zoom factor; expected to be > 0
// @param max       largest valid source index (results are capped to it)
//
// _mm256_cvttps_epi32 truncates toward zero like int(x); _mm256_cvtps_epi32 would
// round to nearest and disagree with the scalar tail. The lanes divide by zoom
// instead of multiplying by a rounded reciprocal for the same reason.
inline void zoomBlock::calculateSrcIndex256(int* srcValues, int size, float zoom, int max)
{
    if (srcValues == nullptr || size <= 0)
    {
        return;
    }

    const __m256 ymmZoom = _mm256_set1_ps(zoom);
    const __m256i ymmMax = _mm256_set1_epi32(max);
    const int vecEnd = size - size % 8;  // first index not covered by a full group of eight

    for (int i = 0; i < vecEnd; i += 8)
    {
        const __m256i ymmIndex =
            _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
        const __m256 ymmScaled = _mm256_div_ps(_mm256_cvtepi32_ps(ymmIndex), ymmZoom);
        // Truncate, then cap at the last valid source index.
        const __m256i ymmSrcValue = _mm256_min_epi32(_mm256_cvttps_epi32(ymmScaled), ymmMax);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(srcValues + i), ymmSrcValue);
    }

    // Remaining elements (fewer than eight) without AVX2.
    for (int i = vecEnd; i < size; ++i)
    {
        srcValues[i] = std::min(static_cast<int>(i / zoom), max);
    }
}
// AVX2 scaling: source indices come from lookup tables built once by
// calculateSrcIndex(); the inner loop computes eight source and eight destination
// offsets per iteration with 256-bit integer adds.
//
// dataZoom must hold int(fXZoom * width) * int(fYZoom * height) bytes.
void zoomBlock::zoomDataAVX2(unsigned char* dataZoom, float fXZoom, float fYZoom)
{
    if (dataZoom == nullptr || m_pData == nullptr || fXZoom <= 0.0f || fYZoom <= 0.0f)
    {
        return;
    }

    const int nZoomDataWidth = static_cast<int>(fXZoom * m_nDataWidth);
    const int nZoomDataHeight = static_cast<int>(fYZoom * m_nDataHeight);
    if (nZoomDataWidth <= 0 || nZoomDataHeight <= 0)
    {
        return;
    }

    // One allocation for both tables: nothing can leak between two separate new[] calls.
    int* indexTable = new int[static_cast<size_t>(nZoomDataWidth) + nZoomDataHeight];
    int* srcX = indexTable;                   // destination column -> source column
    int* srcY = indexTable + nZoomDataWidth;  // destination row    -> source row
    // The tables are built by the 128-bit helper, which truncates like the scalar code.
    calculateSrcIndex(srcX, nZoomDataWidth, fXZoom, m_nDataWidth - 1);
    calculateSrcIndex(srcY, nZoomDataHeight, fYZoom, m_nDataHeight - 1);

    const int vecEnd = nZoomDataWidth - nZoomDataWidth % 8;
    for (int y = 0; y < nZoomDataHeight; ++y)
    {
        const int srcRowBase = srcY[y] * m_nDataWidth;
        const int dstRowBase = y * nZoomDataWidth;
        const __m256i ymmSrcRowBase = _mm256_set1_epi32(srcRowBase);
        const __m256i ymmDstRowBase = _mm256_set1_epi32(dstRowBase);

        for (int x = 0; x < vecEnd; x += 8)
        {
            const __m256i ymmSrcX =
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(srcX + x));
            const __m256i srcPosIndices = _mm256_add_epi32(ymmSrcRowBase, ymmSrcX);
            const __m256i desPosIndices = _mm256_add_epi32(
                ymmDstRowBase,
                _mm256_set_epi32(x + 7, x + 6, x + 5, x + 4, x + 3, x + 2, x + 1, x));

            // Spill the lanes to plain arrays; reading them through a member such as
            // .m256i_i32 only compiles with MSVC.
            int srcPos[8];
            int desPos[8];
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(srcPos), srcPosIndices);
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(desPos), desPosIndices);
            for (int lane = 0; lane < 8; ++lane)
            {
                dataZoom[desPos[lane]] = m_pData[srcPos[lane]];
            }
        }

        // Remaining columns (fewer than eight) without AVX2, using the same tables.
        for (int x = vecEnd; x < nZoomDataWidth; ++x)
        {
            dataZoom[dstRowBase + x] = m_pData[srcRowBase + srcX[x]];
        }
    }

    delete[] indexTable;
}
柚子快報激活碼778899分享:縮放算法優化步驟詳解
推薦鏈接
本文內容根據網絡資料整理,出于傳遞更多信息之目的,不代表金鑰匙跨境贊同其觀點和立場。
轉載請注明,如有侵權,聯系刪除。