Wednesday, July 23, 2014

Transpose matrix with SSE2

Output:
Matrix:
| 1  2  3  4 |
| 5  6  7  8 |
| 8  7  6  5 |
| 4  3  2  1 |

Matrix, transposed:
| 1  5  8  4 |
| 2  6  7  3 |
| 3  7  6  2 |
| 4  8  5  1 |


Code:
/*Feel free to rename the identifiers as you like.*/

/*
 * Toolchain selection for the alignment macro below.
 *   eCG -- GCC-compatible compilers (G++, Clang): __attribute__ syntax.
 *   eCV -- Microsoft Visual C++: __declspec syntax.
 * Either symbol may be predefined by the build (e.g. -DeCV); when neither is,
 * the compiler is detected automatically instead of requiring a manual edit.
 */
#if !defined eCG && !defined eCV
#if defined _MSC_VER && !defined __clang__
#define eCV/*VC++*/
#else
#define eCG/*G++*/
#endif
#endif

/*
 * okN(p): requests p-byte alignment for the type it decorates.
 * It is placed between the struct keyword and the type name.
 */
#ifdef eCG
#define okN(p)__attribute__((aligned(p)))
#elif defined eCV
#define okN(p)__declspec(align(p))
#endif

#include<emmintrin.h>
#include<iostream>

/*
 * 4x4 single-precision matrix, 16-byte aligned so that every column can be
 * moved with one aligned 128-bit SSE2 load or store.
 *
 * Storage is column-major: element (row r, column c), both 1-based, lives at
 * m[(c-1)*4 + (r-1)], so each run of four consecutive floats is one column.
 */
struct alignas(0x10) tR4x4{
    float m[0x10];/*Column-major, not row-major -- OpenGL FTW*/

    /*
     * Generates the element accessors N<row><col>() for 1-based row a and
     * column b: a const overload returning the value and a non-const overload
     * returning a reference. Arguments are parenthesised so the index
     * arithmetic stays correct for any expression passed in.
     */
#define o_(a,b)\
    float N##a##b()const noexcept{return m[(((b)-1)<<2)+((a)-1)];}\
    float&N##a##b()noexcept{return m[(((b)-1)<<2)+((a)-1)];}
    o_(1,1)o_(1,2)o_(1,3)o_(1,4)
    o_(2,1)o_(2,2)o_(2,3)o_(2,4)
    o_(3,1)o_(3,2)o_(3,3)o_(3,4)
    o_(4,1)o_(4,2)o_(4,3)o_(4,4)
#undef o_

    /*
     * Builds the matrix from its 16 elements given in reading (row-by-row)
     * order; pRC is the element at row R, column C. The initialiser lists
     * them column by column because that is the storage order of m.
     */
    tR4x4(
        float p11,float p12,float p13,float p14,
        float p21,float p22,float p23,float p24,
        float p31,float p32,float p33,float p34,
        float p41,float p42,float p43,float p44
    )noexcept:m{
        p11,p21,p31,p41,/*column 1*/
        p12,p22,p32,p42,/*column 2*/
        p13,p23,p33,p43,/*column 3*/
        p14,p24,p34,p44 /*column 4*/
    }{}

    /*
     * Builds the matrix from four 128-bit registers, each holding the raw
     * bits of one column (a = column 1 ... d = column 4).
     */
    tR4x4(__m128i const&a,__m128i const&b,__m128i const&c,__m128i const&d)noexcept{
        /*m sits at offset 0 of a 16-byte aligned object, so aligned stores are valid.*/
        __m128i*const cols=reinterpret_cast<__m128i*>(m);
        _mm_store_si128(cols+0,a);
        _mm_store_si128(cols+1,b);
        _mm_store_si128(cols+2,c);
        _mm_store_si128(cols+3,d);
    }

#if 0/*Without SIMD SSE2 -- scalar reference implementation*/
    tR4x4 kTp()const noexcept{return tR4x4(
        N11(),N21(),N31(),N41(),
        N12(),N22(),N32(),N42(),
        N13(),N23(),N33(),N43(),
        N14(),N24(),N34(),N44());}
#endif

    /*
     * Returns the transpose of this matrix; *this is left untouched.
     *
     * The floats are shuffled as opaque 32-bit lanes with two rounds of SSE2
     * unpack (interleave) instructions, so no arithmetic touches the values.
     */
    tR4x4 kTp()const noexcept{
        /*Read through a pointer-to-const: the object is never modified here.*/
        __m128i const*const cols=reinterpret_cast<__m128i const*>(m);
        __m128i const
            c0=_mm_load_si128(cols+0),
            c1=_mm_load_si128(cols+1),
            c2=_mm_load_si128(cols+2),
            c3=_mm_load_si128(cols+3);

        /*Round 1: interleave columns 0/2 and columns 1/3.*/
        __m128i const
            lo02=_mm_unpacklo_epi32(c0,c2),/*c0[0] c2[0] c0[1] c2[1]*/
            lo13=_mm_unpacklo_epi32(c1,c3),/*c1[0] c3[0] c1[1] c3[1]*/
            hi02=_mm_unpackhi_epi32(c0,c2),/*c0[2] c2[2] c0[3] c2[3]*/
            hi13=_mm_unpackhi_epi32(c1,c3);/*c1[2] c3[2] c1[3] c3[3]*/

        /*Round 2: each result is one original row, stored as a new column.*/
        return tR4x4(
            _mm_unpacklo_epi32(lo02,lo13),/*c0[0] c1[0] c2[0] c3[0]*/
            _mm_unpackhi_epi32(lo02,lo13),/*c0[1] c1[1] c2[1] c3[1]*/
            _mm_unpacklo_epi32(hi02,hi13),/*c0[2] c1[2] c2[2] c3[2]*/
            _mm_unpackhi_epi32(hi02,hi13));/*c0[3] c1[3] c2[3] c3[3]*/
    }

    /*Stream insertion operator; only declared here, defined outside the struct.*/
    friend std::ostream&operator<<(std::ostream&,tR4x4 const&)noexcept;};

/*The SIMD members above rely on exactly four 16-byte columns with 16-byte alignment.*/
static_assert(sizeof(tR4x4)==4*sizeof(__m128i),"tR4x4 must be exactly four 128-bit columns");
static_assert(alignof(tR4x4)>=alignof(__m128i),"tR4x4 must be aligned for aligned SSE2 loads/stores");

std::ostream&operator<<(std::ostream&q,tR4x4 const&p)noexcept{q
<<"| "<<p.N11()<<"  "<<p.N12()<<"  "<<p.N13()<<"  "<<p.N14()<<" |\n"
<<"| "<<p.N21()<<"  "<<p.N22()<<"  "<<p.N23()<<"  "<<p.N24()<<" |\n"
<<"| "<<p.N31()<<"  "<<p.N32()<<"  "<<p.N33()<<"  "<<p.N34()<<" |\n"
<<"| "<<p.N41()<<"  "<<p.N42()<<"  "<<p.N43()<<"  "<<p.N44()<<" |\n";return q;}

void f()noexcept{
tR4x4 l(
1.F,2.F,3.F,4.F,
5.F,6.F,7.F,8.F,
8.F,7.F,6.F,5.F,
4.F,3.F,2.F,1.F);

std::cout<<"Matrix:\n"<<l<<std::endl;
std::cout<<"Matrix, transposed:\n"<<l.kTp()<<std::endl;}

/*Program entry point: runs the demo once and reports success.*/
int main()
{
    f();
    return 0;
}

No comments:

Post a Comment