summaryrefslogtreecommitdiff
path: root/sys/src/libsec/amd64
diff options
context:
space:
mode:
authorcinap_lenrek <cinap_lenrek@felloff.net>2017-11-20 00:10:35 +0100
committercinap_lenrek <cinap_lenrek@felloff.net>2017-11-20 00:10:35 +0100
commit077e719dfbf9bf2582bed80026251cc0d108c16e (patch)
tree5e8fd7d6297f5d007ea21c85f8346eae0717ed7d /sys/src/libsec/amd64
parent1eb373945455f1ba03fa1b221529d74ca2a778ad (diff)
libsec: write optimized _chachablock() function for amd64 / sse2
doing 4 quarterrounds in parallel using 128-bit vector registers. for the second round, shuffle the columns and then shuffle back. the code is rather obvious. the only trick here is that in the first step of the quarterround, PSHUFLW/PSHUFHW is used to swap the halfwords for the <<<16 rotation.
Diffstat (limited to 'sys/src/libsec/amd64')
-rw-r--r--sys/src/libsec/amd64/chachablock.s74
-rw-r--r--sys/src/libsec/amd64/mkfile1
2 files changed, 75 insertions, 0 deletions
diff --git a/sys/src/libsec/amd64/chachablock.s b/sys/src/libsec/amd64/chachablock.s
new file mode 100644
index 000000000..d098c4425
--- /dev/null
+++ b/sys/src/libsec/amd64/chachablock.s
@@ -0,0 +1,74 @@
+#define ROTATE(nbits, tmp, dst) \
+ MOVO tmp, dst; \
+ PSRLL $(32-nbits), dst; \
+ PSLLL $(nbits), tmp; \
+ POR tmp, dst /* dst = tmp <<< nbits in each 32-bit lane; tmp is clobbered */
+
+TEXT _chachablock(SB), 0, $0 /* runs the ChaCha rounds in place on the 64 bytes at RARG; second argument is the round count */
+ MOVOU 0(RARG), X0 /* X0 = bytes 0..15 of the state, called a below (unaligned load) */
+ MOVOU 16(RARG), X1 /* X1 = bytes 16..31, called b */
+ MOVOU 32(RARG), X2 /* X2 = bytes 32..47, called c */
+ MOVOU 48(RARG), X3 /* X3 = bytes 48..63, called d */
+
+ MOVL rounds+8(FP), CX /* CX = rounds */
+ SHRL $1, CX /* each pass of the loop below performs two rounds */
+
+_loop: /* first round of the pair: four quarterrounds at once, one per 32-bit lane */
+ PADDL X1, X0 /* a += b */
+ PXOR X0, X3 /* d ^= a */
+ /* ROTATE(16, X3, X3) */
+ PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 /* swap the 16-bit halves of the two low dwords */
+ PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 /* same for the two high dwords, giving d <<<= 16 */
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4 /* X4 = b, scratch copy because ROTATE clobbers its input */
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(12, X4, X1) /* b = (b ^ c) <<< 12 */
+
+ PADDL X1, X0 /* a += b */
+ MOVO X0, X4 /* X4 = a */
+ PXOR X3, X4 /* X4 = a ^ d */
+ ROTATE(8, X4, X3) /* d = (d ^ a) <<< 8 */
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4 /* X4 = b */
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(7, X4, X1) /* b = (b ^ c) <<< 7 */
+
+ PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1 /* rotate lanes of b by one: lane i takes lane i+1 mod 4 */
+ PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2 /* rotate lanes of c by two */
+ PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3 /* rotate lanes of d by three; a stays put */
+
+ PADDL X1, X0 /* second round of the pair: same sequence on the shuffled lanes; a += b */
+ PXOR X0, X3 /* d ^= a */
+ /* ROTATE(16, X3, X3) */
+ PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 /* halfword swap, low two dwords */
+ PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 /* halfword swap, high two dwords: d <<<= 16 */
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4 /* X4 = b */
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(12, X4, X1) /* b = (b ^ c) <<< 12 */
+
+ PADDL X1, X0 /* a += b */
+ MOVO X0, X4 /* X4 = a */
+ PXOR X3, X4 /* X4 = a ^ d */
+ ROTATE(8, X4, X3) /* d = (d ^ a) <<< 8 */
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4 /* X4 = b */
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(7, X4, X1) /* b = (b ^ c) <<< 7 */
+
+ PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1 /* undo the lane shuffle: b rotated by three more, back to its original order */
+ PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2 /* c rotated by two more */
+ PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3 /* d rotated by one more */
+
+ DECL CX /* one double round done */
+ JNE _loop /* NOTE(review): if rounds is 0 or 1, CX starts at 0 and this wraps, looping about 2^32 times; confirm callers never pass that */
+
+ MOVOU X0, 0(RARG) /* store the four rows back over the input block */
+ MOVOU X1, 16(RARG)
+ MOVOU X2, 32(RARG)
+ MOVOU X3, 48(RARG)
+ RET
diff --git a/sys/src/libsec/amd64/mkfile b/sys/src/libsec/amd64/mkfile
index 990d35aa4..633fde1fe 100644
--- a/sys/src/libsec/amd64/mkfile
+++ b/sys/src/libsec/amd64/mkfile
@@ -3,6 +3,7 @@ objtype=amd64
LIB=/$objtype/lib/libsec.a
FILES=\
+ chachablock\
md5block\
sha1block\
aesni\