author    | cinap_lenrek <cinap_lenrek@felloff.net> | 2017-11-20 00:10:35 +0100
committer | cinap_lenrek <cinap_lenrek@felloff.net> | 2017-11-20 00:10:35 +0100
commit    | 077e719dfbf9bf2582bed80026251cc0d108c16e (patch)
tree      | 5e8fd7d6297f5d007ea21c85f8346eae0717ed7d /sys/src/libsec/amd64
parent    | 1eb373945455f1ba03fa1b221529d74ca2a778ad (diff)
libsec: write optimized _chachablock() function for amd64 / sse2
doing 4 quarterrounds in parallel using 128-bit
vector registers. for the second (diagonal) round, shuffle the
columns and then shuffle them back.

the code is rather obvious. the only trick here is the first
rotation of the quarterround: PSHUFLW/PSHUFHW are used to swap
the halfwords for the <<<16 rotation.
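
for reference, a minimal C sketch of the scalar double round that the assembly below vectorizes. it is illustrative only (names and helper are assumptions, not the libsec port code): each row of the 4x4 state lives in one 128-bit register (X0..X3), so every vector instruction performs the same step of all four column quarterrounds at once, and the diagonal round falls out of rotating rows 1-3 by one, two and three lanes before and after.

	/* illustrative sketch, not the libsec code */
	#include <stdint.h>

	#define ROTL32(v, n)	(((v)<<(n)) | ((v)>>(32-(n))))

	#define QUARTERROUND(a, b, c, d) \
		a += b; d ^= a; d = ROTL32(d, 16); \
		c += d; b ^= c; b = ROTL32(b, 12); \
		a += b; d ^= a; d = ROTL32(d, 8); \
		c += d; b ^= c; b = ROTL32(b, 7);

	/* x[0..3] is row 0 (X0), x[4..7] row 1 (X1), x[8..11] row 2 (X2),
	 * x[12..15] row 3 (X3) of the ChaCha state */
	static void
	doubleround(uint32_t x[16])
	{
		/* column round: the SSE2 code runs these four in lockstep */
		QUARTERROUND(x[0], x[4], x[8],  x[12])
		QUARTERROUND(x[1], x[5], x[9],  x[13])
		QUARTERROUND(x[2], x[6], x[10], x[14])
		QUARTERROUND(x[3], x[7], x[11], x[15])

		/* diagonal round: same as shuffling rows 1..3 by 1, 2 and 3
		 * lanes (the PSHUFL constants), doing another column round,
		 * and shuffling back */
		QUARTERROUND(x[0], x[5], x[10], x[15])
		QUARTERROUND(x[1], x[6], x[11], x[12])
		QUARTERROUND(x[2], x[7], x[8],  x[13])
		QUARTERROUND(x[3], x[4], x[9],  x[14])
	}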
Diffstat (limited to 'sys/src/libsec/amd64')
-rw-r--r-- | sys/src/libsec/amd64/chachablock.s | 74
-rw-r--r-- | sys/src/libsec/amd64/mkfile        |  1

2 files changed, 75 insertions, 0 deletions
diff --git a/sys/src/libsec/amd64/chachablock.s b/sys/src/libsec/amd64/chachablock.s
new file mode 100644
index 000000000..d098c4425
--- /dev/null
+++ b/sys/src/libsec/amd64/chachablock.s
@@ -0,0 +1,74 @@
+#define ROTATE(n, v1, v2) \
+	MOVO	v1, v2; \
+	PSLLL	$(n), v1; \
+	PSRLL	$(32-n), v2; \
+	POR	v1, v2
+
+TEXT	_chachablock(SB), 0, $0
+	MOVOU	0(RARG), X0
+	MOVOU	16(RARG), X1
+	MOVOU	32(RARG), X2
+	MOVOU	48(RARG), X3
+
+	MOVL	rounds+8(FP), CX
+	SHRL	$1, CX
+
+_loop:
+	PADDL	X1, X0
+	PXOR	X0, X3
+	/* ROTATE(16, X3, X3) */
+	PSHUFLW	$(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+	PSHUFHW	$(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+
+	PADDL	X3, X2
+	MOVO	X1, X4
+	PXOR	X2, X4
+	ROTATE(12, X4, X1)
+
+	PADDL	X1, X0
+	MOVO	X0, X4
+	PXOR	X3, X4
+	ROTATE(8, X4, X3)
+
+	PADDL	X3, X2
+	MOVO	X1, X4
+	PXOR	X2, X4
+	ROTATE(7, X4, X1)
+
+	PSHUFL	$(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1
+	PSHUFL	$(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
+	PSHUFL	$(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3
+
+	PADDL	X1, X0
+	PXOR	X0, X3
+	/* ROTATE(16, X3, X3) */
+	PSHUFLW	$(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+	PSHUFHW	$(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+
+	PADDL	X3, X2
+	MOVO	X1, X4
+	PXOR	X2, X4
+	ROTATE(12, X4, X1)
+
+	PADDL	X1, X0
+	MOVO	X0, X4
+	PXOR	X3, X4
+	ROTATE(8, X4, X3)
+
+	PADDL	X3, X2
+	MOVO	X1, X4
+	PXOR	X2, X4
+	ROTATE(7, X4, X1)
+
+	PSHUFL	$(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1
+	PSHUFL	$(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
+	PSHUFL	$(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3
+
+	DECL	CX
+	JNE	_loop
+
+	MOVOU	X0, 0(RARG)
+	MOVOU	X1, 16(RARG)
+	MOVOU	X2, 32(RARG)
+	MOVOU	X3, 48(RARG)
+	RET
diff --git a/sys/src/libsec/amd64/mkfile b/sys/src/libsec/amd64/mkfile
index 990d35aa4..633fde1fe 100644
--- a/sys/src/libsec/amd64/mkfile
+++ b/sys/src/libsec/amd64/mkfile
@@ -3,6 +3,7 @@ objtype=amd64
 LIB=/$objtype/lib/libsec.a
 
 FILES=\
+	chachablock\
 	md5block\
 	sha1block\
 	aesni\
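
for context, a hypothetical caller is sketched below. the prototype and wrapper are inferred from the assembly (RARG points at the 16-word state, the second argument is the round count, which is halved into a double-round counter); they are assumptions for illustration, not copied from libsec's portable chacha code.

	/* illustrative only: prototype inferred from the assembly above */
	#include <stdint.h>
	#include <string.h>

	extern void _chachablock(uint32_t x[16], int rounds);

	/* produce one 64-byte keystream block from a prepared ChaCha state */
	void
	keystreamblock(uint32_t out[16], uint32_t state[16])
	{
		int i;

		memcpy(out, state, 64);
		_chachablock(out, 20);		/* 20 rounds -> the loop runs 10 double rounds */
		for(i = 0; i < 16; i++)
			out[i] += state[i];	/* add the input state back, per ChaCha */
	}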