From 9775eac1fd95d1ca0682937d0549af79186695f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jun 2026 18:53:27 +0000 Subject: [PATCH] chore: bump google.golang.org/api from 0.282.0 to 0.285.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.282.0 to 0.285.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.282.0...v0.285.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-version: 0.285.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 16 +- go.sum | 32 +- .../chacha20poly1305_amd64.go | 6 +- .../chacha20poly1305/chacha20poly1305_amd64.s | 8664 ++++------------- vendor/golang.org/x/net/http2/server_wrap.go | 16 + .../golang.org/x/net/http2/transport_wrap.go | 15 +- vendor/golang.org/x/sync/errgroup/errgroup.go | 2 +- .../golang.org/x/sync/semaphore/semaphore.go | 10 +- vendor/golang.org/x/sys/unix/ztypes_linux.go | 76 + .../golang.org/x/sys/unix/ztypes_linux_386.go | 4 + .../x/sys/unix/ztypes_linux_amd64.go | 4 + .../golang.org/x/sys/unix/ztypes_linux_arm.go | 4 + .../x/sys/unix/ztypes_linux_arm64.go | 4 + .../x/sys/unix/ztypes_linux_loong64.go | 4 + .../x/sys/unix/ztypes_linux_mips.go | 4 + .../x/sys/unix/ztypes_linux_mips64.go | 4 + .../x/sys/unix/ztypes_linux_mips64le.go | 4 + .../x/sys/unix/ztypes_linux_mipsle.go | 4 + .../golang.org/x/sys/unix/ztypes_linux_ppc.go | 4 + .../x/sys/unix/ztypes_linux_ppc64.go | 4 + .../x/sys/unix/ztypes_linux_ppc64le.go | 4 + .../x/sys/unix/ztypes_linux_riscv64.go | 4 + .../x/sys/unix/ztypes_linux_s390x.go | 4 + .../x/sys/unix/ztypes_linux_sparc64.go | 4 + .../google.golang.org/api/internal/version.go | 2 +- vendor/modules.txt | 16 +- 26 files changed, 2273 insertions(+), 6642 deletions(-) diff --git a/go.mod b/go.mod index 1d55190a8a..b719e26797 100644 --- a/go.mod +++ b/go.mod @@ -41,7 +41,7 @@ require ( go.uber.org/zap v1.28.0 golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 golang.org/x/mod v0.37.0 - google.golang.org/api v0.282.0 + google.golang.org/api v0.285.0 gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.35.5 k8s.io/apiextensions-apiserver v0.35.5 @@ -151,19 +151,19 @@ require ( go.uber.org/atomic v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.51.0 // indirect - golang.org/x/net v0.55.0 // indirect + golang.org/x/crypto v0.53.0 // indirect + golang.org/x/net v0.56.0 // indirect golang.org/x/oauth2 v0.36.0 // indirect - golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.45.0 // indirect - golang.org/x/term v0.43.0 // indirect - golang.org/x/text v0.37.0 // indirect + golang.org/x/sync v0.21.0 // indirect + golang.org/x/sys v0.46.0 // indirect + golang.org/x/term v0.44.0 // indirect + golang.org/x/text v0.38.0 // indirect golang.org/x/time v0.15.0 // indirect golang.org/x/tools v0.45.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad // indirect google.golang.org/grpc v1.81.1 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect diff --git a/go.sum b/go.sum index 9b599312c3..49a91f0779 100644 --- a/go.sum +++ b/go.sum @@ -360,8 +360,8 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 h1:1UoZQm6f0P/ZO0w1Ri+f+ifG/gXhegadRdwBIXEFWDo= golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67/go.mod h1:qj5a5QZpwLU2NLQudwIN5koi3beDhSAlJwa67PuM98c= @@ -376,8 +376,8 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= -golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= +golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= +golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= @@ -385,8 +385,8 @@ golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= -golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= +golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -394,14 +394,14 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= -golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= -golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= -golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= +golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= +golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= +golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -420,8 +420,8 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= -google.golang.org/api v0.282.0 h1:WmJiSVqUnKqJCpJOx7YADbXaC+9DDsnGSfllFSj7R2I= -google.golang.org/api v0.282.0/go.mod h1:6Wssta4c5n9qHq5CBhmlai5h/PUa1djdDAIhYEHyvcM= +google.golang.org/api v0.285.0 h1:B7eHHoKGAX/LrPkQvhQqnGwjgWxofbdGwCTQvpm8FkM= +google.golang.org/api v0.285.0/go.mod h1:NlOlUIr8MPoIhT9Bb/oUnRuHbJOLwxb6JSYJM8Yz+jQ= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -431,8 +431,8 @@ google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgn google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:L43LFes82YgSonw6iTXTxXUX1OlULt4AQtkik4ULL/I= google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa h1:Kjn0N0tCrDgiAFW+lGO4JZ3ck44CehvJQMAwj9QF0G8= google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa/go.mod h1:q4lMZS6kskjT5HvCPrnnypcDPVJqT/f4nfxmkE7gryY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa h1:mZHHdPZl0dbGHCflZgAq/Q468DWVFcU2whhB2KAo8fk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad h1:45WmJvIV6C2+O/jjLkPUH+F3aOj/1miDoU2DD0+NWbg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go index b850e772e1..bfe546b60a 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -20,7 +20,7 @@ func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) var ( - useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 + useAVX2 = cpu.X86.HasSSSE3 && cpu.X86.HasAVX2 && cpu.X86.HasBMI2 ) // setupState writes a ChaCha20 input matrix to state. See @@ -47,7 +47,7 @@ func setupState(state *[16]uint32, key *[32]byte, nonce []byte) { } func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { - if !cpu.X86.HasSSSE3 { + if !useAVX2 { return c.sealGeneric(dst, nonce, plaintext, additionalData) } @@ -66,7 +66,7 @@ func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) [] } func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { - if !cpu.X86.HasSSSE3 { + if !useAVX2 { return c.openGeneric(dst, nonce, ciphertext, additionalData) } diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s index fd5ee845f9..c703c13471 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -6,27 +6,6 @@ // func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // Hack: Must declare #define macros inside of a function due to Avo constraints - // ROL rotates the uint32s in register R left by N bits, using temporary T. - #define ROL(N, R, T) \ - MOVO R, T; \ - PSLLL $(N), T; \ - PSRLL $(32-(N)), R; \ - PXOR T, R - - // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. - #ifdef GOAMD64_v2 - #define ROL8(R, T) PSHUFB ·rol8<>(SB), R - #else - #define ROL8(R, T) ROL(8, R, T) - #endif - - // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. - #ifdef GOAMD64_v2 - #define ROL16(R, T) PSHUFB ·rol16<>(SB), R - #else - #define ROL16(R, T) ROL(16, R, T) - #endif XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 @@ -192,676 +171,112 @@ hashADDone: // Requires: AVX, AVX2, BMI2, CMOV, SSE2 TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access - MOVQ SP, BP - ADDQ $0x20, BP - ANDQ $-32, BP - MOVQ dst_base+0(FP), DI - MOVQ key_base+24(FP), R8 - MOVQ src_base+48(FP), SI - MOVQ src_len+56(FP), BX - MOVQ ad_base+72(FP), CX - - // Check for AVX2 support - CMPB ·useAVX2+0(SB), $0x01 - JE chacha20Poly1305Open_AVX2 + MOVQ SP, BP + ADDQ $0x20, BP + ANDQ $-32, BP + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + VZEROUPPER + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VBROADCASTI128 16(R8), Y14 + VBROADCASTI128 32(R8), Y12 + VBROADCASTI128 48(R8), Y4 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ BX, $0x80 - JBE openSSE128 - - // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X9, X13 - - // Store state on stack for future use - MOVO X3, 32(BP) - MOVO X6, 48(BP) - MOVO X9, 128(BP) - MOVQ $0x0000000a, R9 - -openSSEPreparePolyKey: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - DECQ R9 - JNE openSSEPreparePolyKey + CMPQ BX, $0xc0 + JBE openAVX2192 + CMPQ BX, $0x00000140 + JBE openAVX2320 + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 + +openAVX2PreparePolyKey: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL 32(BP), X3 + // Clamp and store poly key + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVO X0, (BP) - MOVO X3, 16(BP) + // Stream for the first 64 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 - // Hash AAD + // Hash AD + first 64 bytes MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) + XORQ CX, CX -openSSEMainLoop: - CMPQ BX, $0x00000100 - JB openSSEMainLoopDone - - // Load state, increment counter blocks - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash - // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $0x00000004, CX - MOVQ SI, R9 - -openSSEInternalLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - LEAQ 16(R9), R9 - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ CX - JGE openSSEInternalLoop - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(R9), R9 - CMPQ CX, $-6 - JG openSSEInternalLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - - // Load - xor - store - MOVO X15, 64(BP) - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU X0, (DI) - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU X3, 16(DI) - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU X6, 32(DI) - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X9, 48(DI) - MOVOU 64(SI), X9 - PXOR X9, X1 - MOVOU X1, 64(DI) - MOVOU 80(SI), X9 - PXOR X9, X4 - MOVOU X4, 80(DI) - MOVOU 96(SI), X9 - PXOR X9, X7 - MOVOU X7, 96(DI) - MOVOU 112(SI), X9 - PXOR X9, X10 - MOVOU X10, 112(DI) - MOVOU 128(SI), X9 - PXOR X9, X2 - MOVOU X2, 128(DI) - MOVOU 144(SI), X9 - PXOR X9, X5 - MOVOU X5, 144(DI) - MOVOU 160(SI), X9 - PXOR X9, X8 - MOVOU X8, 160(DI) - MOVOU 176(SI), X9 - PXOR X9, X11 - MOVOU X11, 176(DI) - MOVOU 192(SI), X9 - PXOR X9, X12 - MOVOU X12, 192(DI) - MOVOU 208(SI), X9 - PXOR X9, X13 - MOVOU X13, 208(DI) - MOVOU 224(SI), X9 - PXOR X9, X14 - MOVOU X14, 224(DI) - MOVOU 240(SI), X9 - PXOR 64(BP), X9 - MOVOU X9, 240(DI) - LEAQ 256(SI), SI - LEAQ 256(DI), DI - SUBQ $0x00000100, BX - JMP openSSEMainLoop - -openSSEMainLoopDone: - // Handle the various tail sizes efficiently - TESTQ BX, BX - JE openSSEFinalize - CMPQ BX, $0x40 - JBE openSSETail64 - CMPQ BX, $0x80 - JBE openSSETail128 - CMPQ BX, $0xc0 - JBE openSSETail192 - JMP openSSETail256 - -openSSEFinalize: - // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), R10 - ADCQ src_len+56(FP), R11 +openAVX2InitialHash64: + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -878,3187 +293,494 @@ openSSEFinalize: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 - // Final reduce - MOVQ R10, R13 - MOVQ R11, R14 - MOVQ R12, R15 - SUBQ $-5, R10 - SBBQ $-1, R11 - SBBQ $0x03, R12 - CMOVQCS R13, R10 - CMOVQCS R14, R11 - CMOVQCS R15, R12 - - // Add in the "s" part of the key - ADDQ 16(BP), R10 - ADCQ 24(BP), R11 - - // Finally, constant time compare to the tag at the end of the message - XORQ AX, AX - MOVQ $0x00000001, DX - XORQ (SI), R10 - XORQ 8(SI), R11 - ORQ R11, R10 - CMOVQEQ DX, AX + // Decrypt the first 64 bytes + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX - // Return true iff tags are equal - MOVB AX, ret+96(FP) - RET +openAVX2MainLoop: + CMPQ BX, $0x00000200 + JB openAVX2MainLoopDone -openSSE128: - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X3, X13 - MOVO X6, X14 - MOVO X10, X15 - MOVQ $0x0000000a, R9 - -openSSE128InnerCipherLoop: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ R9 - JNE openSSE128InnerCipherLoop - - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL X13, X3 - PADDL X13, X4 - PADDL X13, X5 - PADDL X14, X7 - PADDL X14, X8 - PADDL X15, X10 - PADDL ·sseIncMask<>+0(SB), X15 - PADDL X15, X11 - - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVOU X0, (BP) - MOVOU X3, 16(BP) + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - -openSSE128Open: - CMPQ BX, $0x10 - JB openSSETail16 - SUBQ $0x10, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - - // Load for decryption - MOVOU (SI), X12 - PXOR X12, X1 - MOVOU X1, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Shift the stream "left" - MOVO X4, X1 - MOVO X7, X4 - MOVO X10, X7 - MOVO X2, X10 - MOVO X5, X2 - MOVO X8, X5 - MOVO X11, X8 - JMP openSSE128Open - -openSSETail16: - TESTQ BX, BX - JE openSSEFinalize - - // We can safely load the CT from the end, because it is padded with the MAC - MOVQ BX, R9 - SHLQ $0x04, R9 - LEAQ ·andMask<>+0(SB), R13 - MOVOU (SI), X12 - ADDQ BX, SI - PAND -16(R13)(R9*1), X12 - MOVO X12, 64(BP) - MOVQ X12, R13 - MOVQ 72(BP), R14 - PXOR X1, X12 - - // We can only store one byte at a time, since plaintext can be shorter than 16 bytes -openSSETail16Store: - MOVQ X12, R8 - MOVB R8, (DI) - PSRLDQ $0x01, X12 - INCQ DI - DECQ BX - JNE openSSETail16Store - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - JMP openSSEFinalize - -openSSETail64: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - XORQ R9, R9 - MOVQ BX, CX - CMPQ CX, $0x10 - JB openSSETail64LoopB - -openSSETail64LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - SUBQ $0x10, CX - -openSSETail64LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - CMPQ CX, $0x10 - JAE openSSETail64LoopA - CMPQ R9, $0xa0 - JNE openSSETail64LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL 32(BP), X3 - PADDL 48(BP), X6 - PADDL 80(BP), X9 - -openSSETail64DecLoop: - CMPQ BX, $0x10 - JB openSSETail64DecLoopDone - SUBQ $0x10, BX - MOVOU (SI), X12 - PXOR X12, X0 - MOVOU X0, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - MOVO X3, X0 - MOVO X6, X3 - MOVO X9, X6 - JMP openSSETail64DecLoop - -openSSETail64DecLoopDone: - MOVO X0, X1 - JMP openSSETail16 - -openSSETail128: - MOVO ·chacha20Constants<>+0(SB), X1 - MOVO 32(BP), X4 - MOVO 48(BP), X7 - MOVO 128(BP), X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 80(BP) - MOVO X1, X0 - MOVO X4, X3 - MOVO X7, X6 - MOVO X10, X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 96(BP) - XORQ R9, R9 - MOVQ BX, CX - ANDQ $-16, CX - -openSSETail128LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSETail128LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - CMPQ R9, CX - JB openSSETail128LoopA - CMPQ R9, $0xa0 - JNE openSSETail128LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 96(BP), X9 - PADDL 80(BP), X10 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, (DI) - MOVOU X4, 16(DI) - MOVOU X7, 32(DI) - MOVOU X10, 48(DI) - SUBQ $0x40, BX - LEAQ 64(SI), SI - LEAQ 64(DI), DI - JMP openSSETail64DecLoop - -openSSETail192: - MOVO ·chacha20Constants<>+0(SB), X2 - MOVO 32(BP), X5 - MOVO 48(BP), X8 - MOVO 128(BP), X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X11, 80(BP) - MOVO X2, X1 - MOVO X5, X4 - MOVO X8, X7 - MOVO X11, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - MOVO X1, X0 - MOVO X4, X3 - MOVO X7, X6 - MOVO X10, X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 112(BP) - MOVQ BX, CX - MOVQ $0x000000a0, R9 - CMPQ CX, $0xa0 - CMOVQGT R9, CX - ANDQ $-16, CX - XORQ R9, R9 - -openSSLTail192LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSLTail192LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - CMPQ R9, CX - JB openSSLTail192LoopA - CMPQ R9, $0xa0 - JNE openSSLTail192LoopB - CMPQ BX, $0xb0 - JB openSSLTail192Store - ADDQ 160(SI), R10 - ADCQ 168(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - CMPQ BX, $0xc0 - JB openSSLTail192Store - ADDQ 176(SI), R10 - ADCQ 184(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSLTail192Store: - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 32(BP), X5 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 48(BP), X8 - PADDL 112(BP), X9 - PADDL 96(BP), X10 - PADDL 80(BP), X11 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X2 - PXOR X13, X5 - PXOR X14, X8 - PXOR X15, X11 - MOVOU X2, (DI) - MOVOU X5, 16(DI) - MOVOU X8, 32(DI) - MOVOU X11, 48(DI) - MOVOU 64(SI), X12 - MOVOU 80(SI), X13 - MOVOU 96(SI), X14 - MOVOU 112(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - SUBQ $0x80, BX - LEAQ 128(SI), SI - LEAQ 128(DI), DI - JMP openSSETail64DecLoop - -openSSETail256: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - XORQ R9, R9 - -openSSETail256Loop: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - ADDQ $0x10, R9 - CMPQ R9, $0xa0 - JB openSSETail256Loop - MOVQ BX, CX - ANDQ $-16, CX - -openSSETail256HashLoop: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ $0x10, R9 - CMPQ R9, CX - JB openSSETail256HashLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - MOVO X15, 64(BP) - - // Load - xor - store - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVOU 128(SI), X0 - MOVOU 144(SI), X3 - MOVOU 160(SI), X6 - MOVOU 176(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 128(DI) - MOVOU X5, 144(DI) - MOVOU X8, 160(DI) - MOVOU X11, 176(DI) - LEAQ 192(SI), SI - LEAQ 192(DI), DI - SUBQ $0xc0, BX - MOVO X12, X0 - MOVO X13, X3 - MOVO X14, X6 - MOVO 64(BP), X9 - JMP openSSETail64DecLoop - -chacha20Poly1305Open_AVX2: - VZEROUPPER - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x70 - BYTE $0x10 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x20 - BYTE $0xc4 - BYTE $0xc2 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x30 - VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 - - // Special optimization, for very short buffers - CMPQ BX, $0xc0 - JBE openAVX2192 - CMPQ BX, $0x00000140 - JBE openAVX2320 - - // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA Y14, 32(BP) - VMOVDQA Y12, 64(BP) - VMOVDQA Y4, 192(BP) - MOVQ $0x0000000a, R9 - -openAVX2PreparePolyKey: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x04, Y4, Y4, Y4 - DECQ R9 - JNE openAVX2PreparePolyKey - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD 32(BP), Y14, Y14 - VPADDD 64(BP), Y12, Y12 - VPADDD 192(BP), Y4, Y4 - VPERM2I128 $0x02, Y0, Y14, Y3 - - // Clamp and store poly key - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for the first 64 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - - // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - XORQ CX, CX - -openAVX2InitialHash64: - ADDQ (SI)(CX*1), R10 - ADCQ 8(SI)(CX*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ $0x10, CX - CMPQ CX, $0x40 - JNE openAVX2InitialHash64 - - // Decrypt the first 64 bytes - VPXOR (SI), Y0, Y0 - VPXOR 32(SI), Y14, Y14 - VMOVDQU Y0, (DI) - VMOVDQU Y14, 32(DI) - LEAQ 64(SI), SI - LEAQ 64(DI), DI - SUBQ $0x40, BX - -openAVX2MainLoop: - CMPQ BX, $0x00000200 - JB openAVX2MainLoopDone - - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA Y0, Y7 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA Y14, Y11 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA Y12, Y15 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) - VMOVDQA Y3, 192(BP) - XORQ CX, CX - -openAVX2InternalLoop: - ADDQ (SI)(CX*1), R10 - ADCQ 8(SI)(CX*1), R11 - ADCQ $0x01, R12 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 - VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - ADDQ 16(SI)(CX*1), R10 - ADCQ 24(SI)(CX*1), R11 - ADCQ $0x01, R12 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 - VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x04, Y11, Y11, Y11 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - VPALIGNR $0x0c, Y3, Y3, Y3 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - ADDQ 32(SI)(CX*1), R10 - ADCQ 40(SI)(CX*1), R11 - ADCQ $0x01, R12 - LEAQ 48(CX), CX - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 - VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 - VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x0c, Y11, Y11, Y11 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - VPALIGNR $0x04, Y3, Y3, Y3 - CMPQ CX, $0x000001e0 - JNE openAVX2InternalLoop - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 32(BP), Y11, Y11 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 64(BP), Y15, Y15 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPADDD 192(BP), Y3, Y3 - VMOVDQA Y15, 224(BP) - - // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - ADDQ 480(SI), R10 - ADCQ 488(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPERM2I128 $0x02, Y0, Y14, Y15 - VPERM2I128 $0x13, Y0, Y14, Y14 - VPERM2I128 $0x02, Y12, Y4, Y0 - VPERM2I128 $0x13, Y12, Y4, Y12 - VPXOR (SI), Y15, Y15 - VPXOR 32(SI), Y0, Y0 - VPXOR 64(SI), Y14, Y14 - VPXOR 96(SI), Y12, Y12 - VMOVDQU Y15, (DI) - VMOVDQU Y0, 32(DI) - VMOVDQU Y14, 64(DI) - VMOVDQU Y12, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR 128(SI), Y0, Y0 - VPXOR 160(SI), Y14, Y14 - VPXOR 192(SI), Y12, Y12 - VPXOR 224(SI), Y4, Y4 - VMOVDQU Y0, 128(DI) - VMOVDQU Y14, 160(DI) - VMOVDQU Y12, 192(DI) - VMOVDQU Y4, 224(DI) - - // and here - ADDQ 496(SI), R10 - ADCQ 504(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - VPXOR 256(SI), Y0, Y0 - VPXOR 288(SI), Y14, Y14 - VPXOR 320(SI), Y12, Y12 - VPXOR 352(SI), Y4, Y4 - VMOVDQU Y0, 256(DI) - VMOVDQU Y14, 288(DI) - VMOVDQU Y12, 320(DI) - VMOVDQU Y4, 352(DI) - VPERM2I128 $0x02, Y7, Y11, Y0 - VPERM2I128 $0x02, 224(BP), Y3, Y14 - VPERM2I128 $0x13, Y7, Y11, Y12 - VPERM2I128 $0x13, 224(BP), Y3, Y4 - VPXOR 384(SI), Y0, Y0 - VPXOR 416(SI), Y14, Y14 - VPXOR 448(SI), Y12, Y12 - VPXOR 480(SI), Y4, Y4 - VMOVDQU Y0, 384(DI) - VMOVDQU Y14, 416(DI) - VMOVDQU Y12, 448(DI) - VMOVDQU Y4, 480(DI) - LEAQ 512(SI), SI - LEAQ 512(DI), DI - SUBQ $0x00000200, BX - JMP openAVX2MainLoop - -openAVX2MainLoopDone: - // Handle the various tail sizes efficiently - TESTQ BX, BX - JE openSSEFinalize - CMPQ BX, $0x80 - JBE openAVX2Tail128 - CMPQ BX, $0x00000100 - JBE openAVX2Tail256 - CMPQ BX, $0x00000180 - JBE openAVX2Tail384 - JMP openAVX2Tail512 - -openAVX2192: - VMOVDQA Y0, Y5 - VMOVDQA Y14, Y9 - VMOVDQA Y12, Y13 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y0, Y6 - VMOVDQA Y14, Y10 - VMOVDQA Y12, Y8 - VMOVDQA Y4, Y2 - VMOVDQA Y1, Y15 - MOVQ $0x0000000a, R9 - -openAVX2192InnerCipherLoop: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - DECQ R9 - JNE openAVX2192InnerCipherLoop - VPADDD Y6, Y0, Y0 - VPADDD Y6, Y5, Y5 - VPADDD Y10, Y14, Y14 - VPADDD Y10, Y9, Y9 - VPADDD Y8, Y12, Y12 - VPADDD Y8, Y13, Y13 - VPADDD Y2, Y4, Y4 - VPADDD Y15, Y1, Y1 - VPERM2I128 $0x02, Y0, Y14, Y3 - - // Clamp and store poly key - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for up to 192 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - VPERM2I128 $0x02, Y5, Y9, Y12 - VPERM2I128 $0x02, Y13, Y1, Y4 - VPERM2I128 $0x13, Y5, Y9, Y5 - VPERM2I128 $0x13, Y13, Y1, Y9 - -openAVX2ShortOpen: - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - -openAVX2ShortOpenLoop: - CMPQ BX, $0x20 - JB openAVX2ShortTail32 - SUBQ $0x20, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ 16(SI), R10 - ADCQ 24(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Load for decryption - VPXOR (SI), Y0, Y0 - VMOVDQU Y0, (DI) - LEAQ 32(SI), SI - LEAQ 32(DI), DI - - // Shift stream left - VMOVDQA Y14, Y0 - VMOVDQA Y12, Y14 - VMOVDQA Y4, Y12 - VMOVDQA Y5, Y4 - VMOVDQA Y9, Y5 - VMOVDQA Y13, Y9 - VMOVDQA Y1, Y13 - VMOVDQA Y6, Y1 - VMOVDQA Y10, Y6 - JMP openAVX2ShortOpenLoop - -openAVX2ShortTail32: - CMPQ BX, $0x10 - VMOVDQA X0, X1 - JB openAVX2ShortDone - SUBQ $0x10, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Load for decryption - VPXOR (SI), X0, X12 - VMOVDQU X12, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - VPERM2I128 $0x11, Y0, Y0, Y0 - VMOVDQA X0, X1 - -openAVX2ShortDone: - VZEROUPPER - JMP openSSETail16 - -openAVX2320: - VMOVDQA Y0, Y5 - VMOVDQA Y14, Y9 - VMOVDQA Y12, Y13 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y0, Y6 - VMOVDQA Y14, Y10 - VMOVDQA Y12, Y8 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VMOVDQA Y14, Y7 - VMOVDQA Y12, Y11 - VMOVDQA Y4, Y15 - MOVQ $0x0000000a, R9 - -openAVX2320InnerCipherLoop: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - DECQ R9 - JNE openAVX2320InnerCipherLoop - VMOVDQA ·chacha20Constants<>+0(SB), Y3 - VPADDD Y3, Y0, Y0 - VPADDD Y3, Y5, Y5 - VPADDD Y3, Y6, Y6 - VPADDD Y7, Y14, Y14 - VPADDD Y7, Y9, Y9 - VPADDD Y7, Y10, Y10 - VPADDD Y11, Y12, Y12 - VPADDD Y11, Y13, Y13 - VPADDD Y11, Y8, Y8 - VMOVDQA ·avx2IncMask<>+0(SB), Y3 - VPADDD Y15, Y4, Y4 - VPADDD Y3, Y15, Y15 - VPADDD Y15, Y1, Y1 - VPADDD Y3, Y15, Y15 - VPADDD Y15, Y2, Y2 - - // Clamp and store poly key - VPERM2I128 $0x02, Y0, Y14, Y3 - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for up to 320 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - VPERM2I128 $0x02, Y5, Y9, Y12 - VPERM2I128 $0x02, Y13, Y1, Y4 - VPERM2I128 $0x13, Y5, Y9, Y5 - VPERM2I128 $0x13, Y13, Y1, Y9 - VPERM2I128 $0x02, Y6, Y10, Y13 - VPERM2I128 $0x02, Y8, Y2, Y1 - VPERM2I128 $0x13, Y6, Y10, Y6 - VPERM2I128 $0x13, Y8, Y2, Y10 - JMP openAVX2ShortOpen - -openAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>+0(SB), Y5 - VMOVDQA 32(BP), Y9 - VMOVDQA 64(BP), Y13 - VMOVDQA 192(BP), Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 - VMOVDQA Y1, Y4 - XORQ R9, R9 - MOVQ BX, CX - ANDQ $-16, CX - TESTQ CX, CX - JE openAVX2Tail128LoopB - -openAVX2Tail128LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openAVX2Tail128LoopB: - ADDQ $0x10, R9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y1, Y1, Y1 - CMPQ R9, CX - JB openAVX2Tail128LoopA - CMPQ R9, $0xa0 - JNE openAVX2Tail128LoopB - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD 32(BP), Y9, Y9 - VPADDD 64(BP), Y13, Y13 - VPADDD Y4, Y1, Y1 - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - -openAVX2TailLoop: - CMPQ BX, $0x20 - JB openAVX2Tail - SUBQ $0x20, BX - - // Load for decryption - VPXOR (SI), Y0, Y0 - VMOVDQU Y0, (DI) - LEAQ 32(SI), SI - LEAQ 32(DI), DI - VMOVDQA Y14, Y0 - VMOVDQA Y12, Y14 - VMOVDQA Y4, Y12 - JMP openAVX2TailLoop - -openAVX2Tail: - CMPQ BX, $0x10 - VMOVDQA X0, X1 - JB openAVX2TailDone - SUBQ $0x10, BX - - // Load for decryption - VPXOR (SI), X0, X12 - VMOVDQU X12, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - VPERM2I128 $0x11, Y0, Y0, Y0 - VMOVDQA X0, X1 - -openAVX2TailDone: - VZEROUPPER - JMP openSSETail16 - -openAVX2Tail256: - VMOVDQA ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y4, Y7 - VMOVDQA Y1, Y11 - - // Compute the number of iterations that will hash data - MOVQ BX, 224(BP) - MOVQ BX, CX - SUBQ $0x80, CX - SHRQ $0x04, CX - MOVQ $0x0000000a, R9 - CMPQ CX, $0x0a - CMOVQGT R9, CX - MOVQ SI, BX - XORQ R9, R9 - -openAVX2Tail256LoopA: - ADDQ (BX), R10 - ADCQ 8(BX), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(BX), BX - -openAVX2Tail256LoopB: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 +openAVX2InternalLoop: + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 - INCQ R9 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - CMPQ R9, CX - JB openAVX2Tail256LoopA - CMPQ R9, $0x0a - JNE openAVX2Tail256LoopB - MOVQ BX, R9 - SUBQ SI, BX - MOVQ BX, CX - MOVQ 224(BP), BX - -openAVX2Tail256Hash: - ADDQ $0x10, CX - CMPQ CX, BX - JGT openAVX2Tail256HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(R9), R9 - JMP openAVX2Tail256Hash + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 + JNE openAVX2InternalLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) -openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD Y7, Y4, Y4 - VPADDD Y11, Y1, Y1 - VPERM2I128 $0x02, Y0, Y14, Y6 - VPERM2I128 $0x02, Y12, Y4, Y10 - VPERM2I128 $0x13, Y0, Y14, Y8 - VPERM2I128 $0x13, Y12, Y4, Y2 + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR (SI), Y6, Y6 - VPXOR 32(SI), Y10, Y10 - VPXOR 64(SI), Y8, Y8 - VPXOR 96(SI), Y2, Y2 - VMOVDQU Y6, (DI) - VMOVDQU Y10, 32(DI) - VMOVDQU Y8, 64(DI) - VMOVDQU Y2, 96(DI) - LEAQ 128(SI), SI - LEAQ 128(DI), DI - SUBQ $0x80, BX - JMP openAVX2TailLoop + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) -openAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) + // and here + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX + JMP openAVX2MainLoop - // Compute the number of iterations that will hash two blocks of data - MOVQ BX, 224(BP) - MOVQ BX, CX - SUBQ $0x00000100, CX - SHRQ $0x04, CX - ADDQ $0x06, CX - MOVQ $0x0000000a, R9 - CMPQ CX, $0x0a - CMOVQGT R9, CX - MOVQ SI, BX - XORQ R9, R9 +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ BX, BX + JE openSSEFinalize + CMPQ BX, $0x80 + JBE openAVX2Tail128 + CMPQ BX, $0x00000100 + JBE openAVX2Tail256 + CMPQ BX, $0x00000180 + JBE openAVX2Tail384 + JMP openAVX2Tail512 -openAVX2Tail384LoopB: - ADDQ (BX), R10 - ADCQ 8(BX), R11 +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 IMULQ R12, R15 - MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -4075,174 +797,261 @@ openAVX2Tail384LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(BX), BX -openAVX2Tail384LoopA: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - ADDQ (BX), R10 - ADCQ 8(BX), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(BX), BX - INCQ R9 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - CMPQ R9, CX - JB openAVX2Tail384LoopB - CMPQ R9, $0x0a - JNE openAVX2Tail384LoopA - MOVQ BX, R9 - SUBQ SI, BX - MOVQ BX, CX - MOVQ 224(BP), BX + // Final reduce + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 + + // Add in the "s" part of the key + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +openSSETail16: + TESTQ BX, BX + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX + JNE openSSETail16Store + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP openSSEFinalize -openAVX2Tail384Hash: - ADDQ $0x10, CX - CMPQ CX, BX - JGT openAVX2Tail384HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 +openAVX2192: + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 + +openAVX2192InnerCipherLoop: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 + JNE openAVX2192InnerCipherLoop + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + + // Clamp and store poly key + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) + + // Stream for up to 192 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ BX, $0x20 + JB openAVX2ShortTail32 + SUBQ $0x20, BX + + // Load for hashing + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4275,83 +1084,34 @@ openAVX2Tail384Hash: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 - JMP openAVX2Tail384Hash -openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPERM2I128 $0x02, Y0, Y14, Y3 - VPERM2I128 $0x02, Y12, Y4, Y7 - VPERM2I128 $0x13, Y0, Y14, Y11 - VPERM2I128 $0x13, Y12, Y4, Y15 - VPXOR (SI), Y3, Y3 - VPXOR 32(SI), Y7, Y7 - VPXOR 64(SI), Y11, Y11 - VPXOR 96(SI), Y15, Y15 - VMOVDQU Y3, (DI) - VMOVDQU Y7, 32(DI) - VMOVDQU Y11, 64(DI) - VMOVDQU Y15, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y3 - VPERM2I128 $0x02, Y13, Y1, Y7 - VPERM2I128 $0x13, Y5, Y9, Y11 - VPERM2I128 $0x13, Y13, Y1, Y15 - VPXOR 128(SI), Y3, Y3 - VPXOR 160(SI), Y7, Y7 - VPXOR 192(SI), Y11, Y11 - VPXOR 224(SI), Y15, Y15 - VMOVDQU Y3, 128(DI) - VMOVDQU Y7, 160(DI) - VMOVDQU Y11, 192(DI) - VMOVDQU Y15, 224(DI) - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - LEAQ 256(SI), SI - LEAQ 256(DI), DI - SUBQ $0x00000100, BX - JMP openAVX2TailLoop + // Load for decryption + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI -openAVX2Tail512: - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA Y0, Y7 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA Y14, Y11 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA Y12, Y15 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) - VMOVDQA Y3, 192(BP) - XORQ CX, CX - MOVQ SI, R9 + // Shift stream left + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ BX, $0x10 + VMOVDQA X0, X1 + JB openAVX2ShortDone + SUBQ $0x10, BX -openAVX2Tail512LoopB: - ADDQ (R9), R10 - ADCQ 8(R9), R11 + // Load for hashing + ADDQ (SI), R10 + ADCQ 8(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4384,252 +1144,202 @@ openAVX2Tail512LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 -openAVX2Tail512LoopA: + // Load for decryption + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +openAVX2320: + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 + +openAVX2320InnerCipherLoop: VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 + VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 + VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 + VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y10, Y10 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 - VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - ADDQ 16(R9), R10 - ADCQ 24(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 32(R9), R9 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 + VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 + VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y10, Y10 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 - VPALIGNR $0x04, Y3, Y3, Y3 - INCQ CX - CMPQ CX, $0x04 - JLT openAVX2Tail512LoopB - CMPQ CX, $0x0a - JNE openAVX2Tail512LoopA - MOVQ BX, CX - SUBQ $0x00000180, CX - ANDQ $-16, CX + DECQ R9 + JNE openAVX2320InnerCipherLoop + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 -openAVX2Tail512HashLoop: - TESTQ CX, CX - JE openAVX2Tail512HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 + // Clamp and store poly key + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) + + // Stream for up to 320 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 + JMP openAVX2ShortOpen + +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4662,884 +1372,137 @@ openAVX2Tail512HashLoop: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 - SUBQ $0x10, CX - JMP openAVX2Tail512HashLoop -openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 +openAVX2Tail128LoopB: + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 - VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 32(BP), Y11, Y11 - VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 64(BP), Y15, Y15 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPADDD 192(BP), Y3, Y3 - VMOVDQA Y15, 224(BP) - VPERM2I128 $0x02, Y0, Y14, Y15 - VPERM2I128 $0x13, Y0, Y14, Y14 - VPERM2I128 $0x02, Y12, Y4, Y0 - VPERM2I128 $0x13, Y12, Y4, Y12 - VPXOR (SI), Y15, Y15 - VPXOR 32(SI), Y0, Y0 - VPXOR 64(SI), Y14, Y14 - VPXOR 96(SI), Y12, Y12 - VMOVDQU Y15, (DI) - VMOVDQU Y0, 32(DI) - VMOVDQU Y14, 64(DI) - VMOVDQU Y12, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR 128(SI), Y0, Y0 - VPXOR 160(SI), Y14, Y14 - VPXOR 192(SI), Y12, Y12 - VPXOR 224(SI), Y4, Y4 - VMOVDQU Y0, 128(DI) - VMOVDQU Y14, 160(DI) - VMOVDQU Y12, 192(DI) - VMOVDQU Y4, 224(DI) - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - VPXOR 256(SI), Y0, Y0 - VPXOR 288(SI), Y14, Y14 - VPXOR 320(SI), Y12, Y12 - VPXOR 352(SI), Y4, Y4 - VMOVDQU Y0, 256(DI) - VMOVDQU Y14, 288(DI) - VMOVDQU Y12, 320(DI) - VMOVDQU Y4, 352(DI) - VPERM2I128 $0x02, Y7, Y11, Y0 - VPERM2I128 $0x02, 224(BP), Y3, Y14 - VPERM2I128 $0x13, Y7, Y11, Y12 - VPERM2I128 $0x13, 224(BP), Y3, Y4 - LEAQ 384(SI), SI - LEAQ 384(DI), DI - SUBQ $0x00000180, BX - JMP openAVX2TailLoop - -DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 -GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 - -DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff -DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc -DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff -DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff -GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 - -DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 -DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 -GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 - -DATA ·andMask<>+0(SB)/8, $0x00000000000000ff -DATA ·andMask<>+8(SB)/8, $0x0000000000000000 -DATA ·andMask<>+16(SB)/8, $0x000000000000ffff -DATA ·andMask<>+24(SB)/8, $0x0000000000000000 -DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+40(SB)/8, $0x0000000000000000 -DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+56(SB)/8, $0x0000000000000000 -DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+72(SB)/8, $0x0000000000000000 -DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+88(SB)/8, $0x0000000000000000 -DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+104(SB)/8, $0x0000000000000000 -DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+120(SB)/8, $0x0000000000000000 -DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+136(SB)/8, $0x00000000000000ff -DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+152(SB)/8, $0x000000000000ffff -DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff -GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 -DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 -DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 -DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 -DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 -GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 +openAVX2TailLoop: + CMPQ BX, $0x20 + JB openAVX2Tail + SUBQ $0x20, BX -DATA ·rol16<>+0(SB)/8, $0x0504070601000302 -DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a -DATA ·rol16<>+16(SB)/8, $0x0504070601000302 -DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a -GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + // Load for decryption + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + JMP openAVX2TailLoop -DATA ·rol8<>+0(SB)/8, $0x0605040702010003 -DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b -DATA ·rol8<>+16(SB)/8, $0x0605040702010003 -DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b -GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 +openAVX2Tail: + CMPQ BX, $0x10 + VMOVDQA X0, X1 + JB openAVX2TailDone + SUBQ $0x10, BX -DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 -DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 -DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 -DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 -GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + // Load for decryption + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 -// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) -// Requires: AVX, AVX2, BMI2, CMOV, SSE2 -TEXT ·chacha20Poly1305Seal(SB), $288-96 - MOVQ SP, BP - ADDQ $0x20, BP - ANDQ $-32, BP - MOVQ dst_base+0(FP), DI - MOVQ key_base+24(FP), R8 - MOVQ src_base+48(FP), SI - MOVQ src_len+56(FP), BX - MOVQ ad_base+72(FP), CX - CMPB ·useAVX2+0(SB), $0x01 - JE chacha20Poly1305Seal_AVX2 +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 - // Special optimization, for very short buffers - CMPQ BX, $0x80 - JBE sealSSE128 - - // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - - // Store state on stack for future use - MOVO X3, 32(BP) - MOVO X6, 48(BP) - - // Load state, increment counter blocks - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - MOVQ $0x0000000a, R9 - -sealSSEIntroLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ R9 - JNE sealSSEIntroLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVO X0, (BP) - MOVO X3, 16(BP) - - // Hash AAD - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, (DI) - MOVOU X4, 16(DI) - MOVOU X7, 32(DI) - MOVOU X10, 48(DI) - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 64(DI) - MOVOU X5, 80(DI) - MOVOU X8, 96(DI) - MOVOU X11, 112(DI) - MOVQ $0x00000080, CX - SUBQ $0x80, BX - LEAQ 128(SI), SI - MOVO X12, X1 - MOVO X13, X4 - MOVO X14, X7 - MOVO X15, X10 - CMPQ BX, $0x40 - JBE sealSSE128SealHash - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X12 - PXOR X3, X13 - PXOR X6, X14 - PXOR X9, X15 - MOVOU X12, 128(DI) - MOVOU X13, 144(DI) - MOVOU X14, 160(DI) - MOVOU X15, 176(DI) - ADDQ $0x40, CX - SUBQ $0x40, BX - LEAQ 64(SI), SI - MOVQ $0x00000002, CX - MOVQ $0x00000008, R9 - CMPQ BX, $0x40 - JBE sealSSETail64 - CMPQ BX, $0x80 - JBE sealSSETail128 - CMPQ BX, $0xc0 - JBE sealSSETail192 - -sealSSEMainLoop: - // Load state, increment counter blocks - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - -sealSSEInnerLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail256: + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + + // Compute the number of iterations that will hash data + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + +openAVX2Tail256LoopA: + ADDQ (BX), R10 + ADCQ 8(BX), R11 ADCQ $0x01, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - LEAQ 16(DI), DI - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 @@ -5555,112 +1518,117 @@ sealSSEInnerLoop: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ R9 - JGE sealSSEInnerLoop - ADDQ (DI), R10 - ADCQ 8(DI), R11 + LEAQ 16(BX), BX + +openAVX2Tail256LoopB: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX + +openAVX2Tail256Hash: + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -5672,143 +1640,94 @@ sealSSEInnerLoop: SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - DECQ CX - JG sealSSEInnerLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - MOVO X15, 64(BP) - - // Load - xor - store - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVO 64(BP), X15 - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVOU 128(SI), X0 - MOVOU 144(SI), X3 - MOVOU 160(SI), X6 - MOVOU 176(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 128(DI) - MOVOU X5, 144(DI) - MOVOU X8, 160(DI) - MOVOU X11, 176(DI) - ADDQ $0xc0, SI - MOVQ $0x000000c0, CX - SUBQ $0xc0, BX - MOVO X12, X1 - MOVO X13, X4 - MOVO X14, X7 - MOVO X15, X10 - CMPQ BX, $0x40 - JBE sealSSE128SealHash - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X12 - PXOR X3, X13 - PXOR X6, X14 - PXOR X9, X15 - MOVOU X12, 192(DI) - MOVOU X13, 208(DI) - MOVOU X14, 224(DI) - MOVOU X15, 240(DI) - LEAQ 64(SI), SI - SUBQ $0x40, BX - MOVQ $0x00000006, CX - MOVQ $0x00000004, R9 - CMPQ BX, $0xc0 - JG sealSSEMainLoop - MOVQ BX, CX - TESTQ BX, BX - JE sealSSE128SealHash - MOVQ $0x00000006, CX - CMPQ BX, $0x40 - JBE sealSSETail64 - CMPQ BX, $0x80 - JBE sealSSETail128 - JMP sealSSETail192 - -sealSSETail64: - MOVO ·chacha20Constants<>+0(SB), X1 - MOVO 32(BP), X4 - MOVO 48(BP), X7 - MOVO 128(BP), X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 80(BP) - -sealSSETail64LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + + // Compute the number of iterations that will hash two blocks of data + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + +openAVX2Tail384LoopB: + ADDQ (BX), R10 + ADCQ 8(BX), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -5825,175 +1744,190 @@ sealSSETail64LoopA: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI + LEAQ 16(BX), BX + +openAVX2Tail384LoopA: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX -sealSSETail64LoopB: - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x0c, X13 - PSRLL $0x14, X4 - PXOR X13, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x07, X13 - PSRLL $0x19, X4 - PXOR X13, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x0c, X13 - PSRLL $0x14, X4 - PXOR X13, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x07, X13 - PSRLL $0x19, X4 - PXOR X13, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - DECQ CX - JG sealSSETail64LoopA - DECQ R9 - JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X4 - PADDL 48(BP), X7 - PADDL 80(BP), X10 - JMP sealSSE128Seal - -sealSSETail128: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - -sealSSETail128LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail384Hash: + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6010,430 +1944,99 @@ sealSSETail128LoopA: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash -sealSSETail128LoopB: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - DECQ CX - JG sealSSETail128LoopA - DECQ R9 - JGE sealSSETail128LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 80(BP), X9 - PADDL 96(BP), X10 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X0 - PXOR X13, X3 - PXOR X14, X6 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVQ $0x00000040, CX - LEAQ 64(SI), SI - SUBQ $0x40, BX - JMP sealSSE128SealHash - -sealSSETail192: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X11, 112(BP) - -sealSSETail192LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX + JMP openAVX2TailLoop -sealSSETail192LoopB: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 + +openAVX2Tail512LoopB: + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6450,465 +2053,268 @@ sealSSETail192LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ CX - JG sealSSETail192LoopA - DECQ R9 - JGE sealSSETail192LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 32(BP), X5 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 48(BP), X8 - PADDL 80(BP), X9 - PADDL 96(BP), X10 - PADDL 112(BP), X11 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X0 - PXOR X13, X3 - PXOR X14, X6 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVOU 64(SI), X12 - MOVOU 80(SI), X13 - MOVOU 96(SI), X14 - MOVOU 112(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVO X2, X1 - MOVO X5, X4 - MOVO X8, X7 - MOVO X11, X10 - MOVQ $0x00000080, CX - LEAQ 128(SI), SI - SUBQ $0x80, BX - JMP sealSSE128SealHash - -sealSSE128: - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X3, X13 - MOVO X6, X14 - MOVO X10, X15 - MOVQ $0x0000000a, R9 - -sealSSE128InnerCipherLoop: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ R9 - JNE sealSSE128InnerCipherLoop - - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL X13, X3 - PADDL X13, X4 - PADDL X13, X5 - PADDL X14, X7 - PADDL X14, X8 - PADDL X15, X10 - PADDL ·sseIncMask<>+0(SB), X15 - PADDL X15, X11 - PAND ·polyClampMask<>+0(SB), X0 - MOVOU X0, (BP) - MOVOU X3, 16(BP) + LEAQ 16(R9), R9 - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - XORQ CX, CX +openAVX2Tail512LoopA: + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 + JLT openAVX2Tail512LoopB + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX -sealSSE128SealHash: - CMPQ CX, $0x10 - JB sealSSE128Seal - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail512HashLoop: + TESTQ CX, CX + JE openAVX2Tail512HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6925,238 +2331,162 @@ sealSSE128SealHash: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 + LEAQ 16(R9), R9 SUBQ $0x10, CX - ADDQ $0x10, DI - JMP sealSSE128SealHash - -sealSSE128Seal: - CMPQ BX, $0x10 - JB sealSSETail - SUBQ $0x10, BX - - // Load for decryption - MOVOU (SI), X12 - PXOR X12, X1 - MOVOU X1, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - - // Extract for hashing - MOVQ X1, R13 - PSRLDQ $0x08, X1 - MOVQ X1, R14 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Shift the stream "left" - MOVO X4, X1 - MOVO X7, X4 - MOVO X10, X7 - MOVO X2, X10 - MOVO X5, X2 - MOVO X8, X5 - MOVO X11, X8 - JMP sealSSE128Seal + JMP openAVX2Tail512HashLoop -sealSSETail: - TESTQ BX, BX - JE sealSSEFinalize +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop - // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ BX, R9 - SHLQ $0x04, R9 - LEAQ ·andMask<>+0(SB), R13 - MOVQ BX, CX - LEAQ -1(SI)(BX*1), SI - XORQ R15, R15 - XORQ R8, R8 - XORQ AX, AX +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 -sealSSETailLoadLoop: - SHLQ $0x08, R15, R8 - SHLQ $0x08, R15 - MOVB (SI), AX - XORQ AX, R15 - LEAQ -1(SI), SI - DECQ CX - JNE sealSSETailLoadLoop - MOVQ R15, 64(BP) - MOVQ R8, 72(BP) - PXOR 64(BP), X1 - MOVOU X1, (DI) - MOVOU -16(R13)(R9*1), X12 - PAND X12, X1 - MOVQ X1, R13 - PSRLDQ $0x08, X1 - MOVQ X1, R14 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ BX, DI +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 -sealSSEFinalize: - // Hash in the buffer lengths - ADDQ ad_len+80(FP), R10 - ADCQ src_len+56(FP), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 - // Final reduce - MOVQ R10, R13 - MOVQ R11, R14 - MOVQ R12, R15 - SUBQ $-5, R10 - SBBQ $-1, R11 - SBBQ $0x03, R12 - CMOVQCS R13, R10 - CMOVQCS R14, R11 - CMOVQCS R15, R12 +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 - // Add in the "s" part of the key - ADDQ 16(BP), R10 - ADCQ 24(BP), R11 +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 - // Finally store the tag at the end of the message - MOVQ R10, (DI) - MOVQ R11, 8(DI) - RET +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 -chacha20Poly1305Seal_AVX2: +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 + MOVQ SP, BP + ADDQ $0x20, BP + ANDQ $-32, BP + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX VZEROUPPER - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x70 - BYTE $0x10 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x20 - BYTE $0xc4 - BYTE $0xc2 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x30 - VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VBROADCASTI128 16(R8), Y14 + VBROADCASTI128 32(R8), Y12 + VBROADCASTI128 48(R8), Y4 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers CMPQ BX, $0x000000c0 @@ -8170,6 +3500,144 @@ sealAVX2InternalLoopStart: JBE sealAVX2Tail384 JMP sealAVX2Tail512 +sealSSETail: + TESTQ BX, BX + JE sealSSEFinalize + + // We can only load the PT one byte at a time to avoid read after end of buffer + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 + XORQ AX, AX + +sealSSETailLoadLoop: + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX + JNE sealSSETailLoadLoop + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI + +sealSSEFinalize: + // Hash in the buffer lengths + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + + // Final reduce + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 + + // Add in the "s" part of the key + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 + + // Finally store the tag at the end of the message + MOVQ R10, (DI) + MOVQ R11, 8(DI) + RET + seal192AVX2: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 diff --git a/vendor/golang.org/x/net/http2/server_wrap.go b/vendor/golang.org/x/net/http2/server_wrap.go index a7a09551c4..737f1f0573 100644 --- a/vendor/golang.org/x/net/http2/server_wrap.go +++ b/vendor/golang.org/x/net/http2/server_wrap.go @@ -10,9 +10,11 @@ package http2 import ( "context" + "crypto/tls" "errors" "net" "net/http" + "slices" "sync" "time" ) @@ -44,6 +46,20 @@ func configureServer(s *http.Server, conf *Server) error { h2.IdleTimeout = h1.ReadTimeout } } + + // Register h2 and http/1.1 ALPN protocols on s.TLSConfig, matching + // the pre-wrapping implementation in server.go, so that TLS listeners + // built from s.TLSConfig still negotiate HTTP/2. + if s.TLSConfig == nil { + s.TLSConfig = new(tls.Config) + } + if !slices.Contains(s.TLSConfig.NextProtos, NextProtoTLS) { + s.TLSConfig.NextProtos = append(s.TLSConfig.NextProtos, NextProtoTLS) + } + if !slices.Contains(s.TLSConfig.NextProtos, "http/1.1") { + s.TLSConfig.NextProtos = append(s.TLSConfig.NextProtos, "http/1.1") + } + conf.state = &serverInternalState{ s1: s, } diff --git a/vendor/golang.org/x/net/http2/transport_wrap.go b/vendor/golang.org/x/net/http2/transport_wrap.go index d25d99bdbb..eab2e6b073 100644 --- a/vendor/golang.org/x/net/http2/transport_wrap.go +++ b/vendor/golang.org/x/net/http2/transport_wrap.go @@ -22,8 +22,8 @@ import ( ) func configureTransport(t1 *http.Transport) error { - // ConfigureTransport is a no-op: The http.Transport already supports HTTP/2. - return nil + _, err := configureTransports(t1) + return err } func configureTransports(t1 *http.Transport) (*Transport, error) { @@ -31,6 +31,17 @@ func configureTransports(t1 *http.Transport) (*Transport, error) { // linked to the http.Transport's. tr2 := &Transport{} tr2.configure(t1) + // Enable HTTP/2 on the transport, as the pre-wrapping implementation did: + // net/http does not auto-enable it for a transport with a custom + // TLSClientConfig or dialer. + if t1.TLSClientConfig == nil { + t1.TLSClientConfig = &tls.Config{} + } + if t1.Protocols == nil { + t1.Protocols = new(http.Protocols) + t1.Protocols.SetHTTP1(true) + } + t1.Protocols.SetHTTP2(true) return tr2, nil } diff --git a/vendor/golang.org/x/sync/errgroup/errgroup.go b/vendor/golang.org/x/sync/errgroup/errgroup.go index f69fd75468..c261a8ebbd 100644 --- a/vendor/golang.org/x/sync/errgroup/errgroup.go +++ b/vendor/golang.org/x/sync/errgroup/errgroup.go @@ -109,7 +109,7 @@ func (g *Group) TryGo(f func() error) bool { if g.sem != nil { select { case g.sem <- token{}: - // Note: this allows barging iff channels in general allow barging. + // Note: this allows barging if and only if channels in general allow barging. default: return false } diff --git a/vendor/golang.org/x/sync/semaphore/semaphore.go b/vendor/golang.org/x/sync/semaphore/semaphore.go index b618162aab..040c5bc509 100644 --- a/vendor/golang.org/x/sync/semaphore/semaphore.go +++ b/vendor/golang.org/x/sync/semaphore/semaphore.go @@ -83,7 +83,7 @@ func (s *Weighted) Acquire(ctx context.Context, n int64) error { default: isFront := s.waiters.Front() == elem s.waiters.Remove(elem) - // If we're at the front and there're extra tokens left, notify other waiters. + // If we're at the front and there are extra tokens left, notify other waiters. if isFront && s.size > s.cur { s.notifyWaiters() } @@ -139,15 +139,15 @@ func (s *Weighted) notifyWaiters() { w := next.Value.(waiter) if s.size-s.cur < w.n { - // Not enough tokens for the next waiter. We could keep going (to try to + // Not enough tokens for the next waiter. We could keep going (to try to // find a waiter with a smaller request), but under load that could cause // starvation for large requests; instead, we leave all remaining waiters // blocked. // // Consider a semaphore used as a read-write lock, with N tokens, N - // readers, and one writer. Each reader can Acquire(1) to obtain a read - // lock. The writer can Acquire(N) to obtain a write lock, excluding all - // of the readers. If we allow the readers to jump ahead in the queue, + // readers, and one writer. Each reader can Acquire(1) to obtain a read + // lock. The writer can Acquire(N) to obtain a write lock, excluding all + // of the readers. If we allow the readers to jump ahead in the queue, // the writer will starve — there is always one token available for every // reader. break diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go index d11d5b96a4..526a0d5f43 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go @@ -6397,3 +6397,79 @@ const ( MPOL_PREFERRED_MANY = 0x5 MPOL_WEIGHTED_INTERLEAVE = 0x6 ) + +const ( + GPIO_V2_GET_LINEINFO_IOCTL = 0xc100b405 + GPIO_V2_GET_LINE_IOCTL = 0xc250b407 + GPIO_V2_LINE_GET_VALUES_IOCTL = 0xc010b40e + GPIO_V2_LINE_SET_VALUES_IOCTL = 0xc010b40f + GPIO_V2_GET_LINEINFO_WATCH_IOCTL = 0xc100b406 + GPIO_GET_LINEINFO_UNWATCH_IOCTL = 0xc004b40c +) +const ( + GPIO_V2_LINE_ATTR_ID_FLAGS = 0x1 + GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES = 0x2 + GPIO_V2_LINE_ATTR_ID_DEBOUNCE = 0x3 + GPIO_V2_LINE_CHANGED_REQUESTED = 0x1 + GPIO_V2_LINE_CHANGED_RELEASED = 0x2 + GPIO_V2_LINE_CHANGED_CONFIG = 0x3 + GPIO_V2_LINE_EVENT_RISING_EDGE = 0x1 + GPIO_V2_LINE_EVENT_FALLING_EDGE = 0x2 +) + +type GPIOChipInfo struct { + Name [32]byte + Label [32]byte + Lines uint32 +} +type GPIOV2LineValues struct { + Bits uint64 + Mask uint64 +} +type GPIOV2LineAttribute struct { + Id uint32 + _ uint32 + Flags uint64 +} +type GPIOV2LineConfigAttribute struct { + Attr GPIOV2LineAttribute + Mask uint64 +} +type GPIOV2LineConfig struct { + Flags uint64 + Num_attrs uint32 + _ [5]uint32 + Attrs [10]GPIOV2LineConfigAttribute +} +type GPIOV2LineRequest struct { + Offsets [64]uint32 + Consumer [32]byte + Config GPIOV2LineConfig + Num_lines uint32 + Event_buffer_size uint32 + _ [5]uint32 + Fd int32 +} +type GPIOV2LineInfo struct { + Name [32]byte + Consumer [32]byte + Offset uint32 + Num_attrs uint32 + Flags uint64 + Attrs [10]GPIOV2LineAttribute + _ [4]uint32 +} +type GPIOV2LineInfoChanged struct { + Info GPIOV2LineInfo + Timestamp_ns uint64 + Event_type uint32 + _ [5]uint32 +} +type GPIOV2LineEvent struct { + Timestamp_ns uint64 + Id uint32 + Offset uint32 + Seqno uint32 + Line_seqno uint32 + _ [6]uint32 +} diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go index 97ef790deb..aede1de7f2 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go @@ -711,3 +711,7 @@ type SysvShmDesc struct { _ uint32 _ uint32 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go index 90b50da680..bb3bc4dc2c 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go @@ -725,3 +725,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go index acda136851..1fdf4c5175 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go @@ -705,3 +705,7 @@ type SysvShmDesc struct { _ uint32 _ uint32 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go index ef7a99e1f9..063e6f0b41 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go @@ -704,3 +704,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go index 966063dfc1..9cf836c708 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go @@ -705,3 +705,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go index dc53b20b74..1d222fcb31 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go @@ -710,3 +710,7 @@ type SysvShmDesc struct { Ctime_high uint16 _ uint16 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go index 9ad0aa8c31..912cc4ab63 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go @@ -707,3 +707,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go index 29d55493d5..1e358ef34f 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go @@ -707,3 +707,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go index a4d9e15848..df59f32f5e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go @@ -710,3 +710,7 @@ type SysvShmDesc struct { Ctime_high uint16 _ uint16 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go index f8a2977716..29355aa0bf 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go @@ -718,3 +718,7 @@ type SysvShmDesc struct { _ uint32 _ [4]byte } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go index 4158d6c4ee..c6083a15d7 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go @@ -713,3 +713,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go index 1035af49f7..6321cc7626 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go @@ -713,3 +713,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go index 2297125d3c..b44f402feb 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go @@ -792,3 +792,7 @@ const ( RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE = 0x6 RISCV_HWPROBE_WHICH_CPUS = 0x1 ) + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go index 8481e9bd98..b22c795a64 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go @@ -727,3 +727,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go index a6828a0310..0b18075b53 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go @@ -708,3 +708,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/google.golang.org/api/internal/version.go b/vendor/google.golang.org/api/internal/version.go index f293e1abbe..73cc73c9d1 100644 --- a/vendor/google.golang.org/api/internal/version.go +++ b/vendor/google.golang.org/api/internal/version.go @@ -5,4 +5,4 @@ package internal // Version is the current tagged release of the library. -const Version = "0.282.0" +const Version = "0.285.0" diff --git a/vendor/modules.txt b/vendor/modules.txt index 0e24910c12..7ea7807ecb 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -582,7 +582,7 @@ go.yaml.in/yaml/v2 # go.yaml.in/yaml/v3 v3.0.4 ## explicit; go 1.16 go.yaml.in/yaml/v3 -# golang.org/x/crypto v0.51.0 +# golang.org/x/crypto v0.53.0 ## explicit; go 1.25.0 golang.org/x/crypto/chacha20 golang.org/x/crypto/chacha20poly1305 @@ -599,7 +599,7 @@ golang.org/x/exp/maps golang.org/x/mod/internal/lazyregexp golang.org/x/mod/module golang.org/x/mod/semver -# golang.org/x/net v0.55.0 +# golang.org/x/net v0.56.0 ## explicit; go 1.25.0 golang.org/x/net/context/ctxhttp golang.org/x/net/http/httpguts @@ -622,21 +622,21 @@ golang.org/x/oauth2/google/internal/stsexchange golang.org/x/oauth2/internal golang.org/x/oauth2/jws golang.org/x/oauth2/jwt -# golang.org/x/sync v0.20.0 +# golang.org/x/sync v0.21.0 ## explicit; go 1.25.0 golang.org/x/sync/errgroup golang.org/x/sync/semaphore -# golang.org/x/sys v0.45.0 +# golang.org/x/sys v0.46.0 ## explicit; go 1.25.0 golang.org/x/sys/cpu golang.org/x/sys/plan9 golang.org/x/sys/unix golang.org/x/sys/windows golang.org/x/sys/windows/registry -# golang.org/x/term v0.43.0 +# golang.org/x/term v0.44.0 ## explicit; go 1.25.0 golang.org/x/term -# golang.org/x/text v0.37.0 +# golang.org/x/text v0.38.0 ## explicit; go 1.25.0 golang.org/x/text/cases golang.org/x/text/encoding @@ -685,7 +685,7 @@ golang.org/x/tools/internal/versions # gomodules.xyz/jsonpatch/v2 v2.4.0 ## explicit; go 1.20 gomodules.xyz/jsonpatch/v2 -# google.golang.org/api v0.282.0 +# google.golang.org/api v0.285.0 ## explicit; go 1.25.8 google.golang.org/api/googleapi google.golang.org/api/googleapi/transport @@ -718,7 +718,7 @@ google.golang.org/genproto/googleapis/api/httpbody google.golang.org/genproto/googleapis/api/label google.golang.org/genproto/googleapis/api/metric google.golang.org/genproto/googleapis/api/monitoredres -# google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa +# google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad ## explicit; go 1.25.0 google.golang.org/genproto/googleapis/rpc/code google.golang.org/genproto/googleapis/rpc/context/attribute_context