Warmcat homepage andy@warmcat.com
libwebsockets
{"schema":"libjg2-1", "vpath":"/git/", "avatar":"/git/avatar/", "alang":"", "gen_ut":1752657393, "reponame":"openssl", "desc":"OpenSSL", "owner": { "name": "Andy Green", "email": "andy@warmcat.com", "md5": "c50933ca2aa61e0fe2c43d46bb6b59cb" },"url":"https://warmcat.com/repo/openssl", "f":3, "items": [ {"schema":"libjg2-1", "cid":"1a1efb67bfd8718ad9d9035a6a2c6385", "commit": {"type":"commit", "time": 1489427314, "time_ofs": 60, "oid_tree": { "oid": "5728bd27b125f93423340293126f418f8daabe2a", "alias": []}, "oid":{ "oid": "c2b935904a3887f99c452cc120f7e8f2cc7ab33f", "alias": []}, "msg": "poly1305/asm/poly1305-x86_64.pl: add poly1305_blocks_vpmadd52_4x.", "sig_commit": { "git_time": { "time": 1489427314, "offset": 60 }, "name": "Andy Polyakov", "email": "appro@openssl.org", "md5": "50bd64fa2a792cbbf679fa16213a3b2a" }, "sig_author": { "git_time": { "time": 1489329436, "offset": 60 }, "name": "Andy Polyakov", "email": "appro@openssl.org", "md5": "50bd64fa2a792cbbf679fa16213a3b2a" }}, "body": "poly1305/asm/poly1305-x86_64.pl: add poly1305_blocks_vpmadd52_4x.\n\nAs hinted by its name new subroutine processes 4 input blocks in\nparallel. It still operates on 256-bit registers and is just\nanother step toward full-blown AVX512IFMA procedure.\n\nReviewed-by: Rich Salz \u003crsalz@openssl.org\u003e\n" , "diff": "diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl\nindex a397019..71a9efb 100755\n--- a/crypto/poly1305/asm/poly1305-x86_64.pl\n+++ b/crypto/poly1305/asm/poly1305-x86_64.pl\n@@ -2716,6 +2716,17 @@ if ($avx\u003e3) {\n # path longer. In other words, even though base 2^44 reduction might\n # look less elegant, overall critical path is actually shorter...\n \n+########################################################################\n+# Layout of opaque area is following.\n+#\n+#\tunsigned __int64 h[3];\t\t# current hash value base 2^44\n+#\tunsigned __int64 s[2];\t\t# key value*20 base 2^44\n+#\tunsigned __int64 r[3];\t\t# key value base 2^44\n+#\tstruct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];\n+#\t\t\t\t\t# r^n positions reflect\n+#\t\t\t\t\t# placement in register, not\n+#\t\t\t\t\t# memory, R[3] is R[1]*20\n+\n $code.\u003d\u003c\u003c___;\n .type\tpoly1305_init_base2_44,\u005c@function,3\n .align\t32\n@@ -2748,6 +2759,7 @@ poly1305_init_base2_44:\n \tshl\t\u005c$2,%rcx\t\t# magic \u003c\u003c2\n \tmov\t%rax,24($ctx)\t\t# s1\n \tmov\t%rcx,32($ctx)\t\t# s2\n+\tmovq\t\u005c$-1,64($ctx)\t\t# write impossible value\n ___\n $code.\u003d\u003c\u003c___\tif ($flavour !~ /elf32/);\n \tmov\t%r10,0(%rdx)\n@@ -2774,11 +2786,29 @@ poly1305_blocks_vpmadd52:\n \tshr\t\u005c$4,$len\n \tjz\t.Lno_data_vpmadd52\t\t# too short\n \n+\tshl\t\u005c$40,$padbit\n+\tmov\t64($ctx),%r8\t\t\t# peek on power of the key\n+\n+\t# if powers of the key are not calculated yet, process up to 3\n+\t# blocks with this single-block subroutine, otherwise ensure that\n+\t# length is divisible by 2 blocks and pass the rest down to next\n+\t# subroutine...\n+\n+\tmov\t\u005c$3,%rax\n+\tmov\t\u005c$1,%r10\n+\tcmp\t\u005c$4,$len\t\t\t# is input long\n+\tcmovae\t%r10,%rax\n+\ttest\t%r8,%r8\t\t\t\t# is power value impossible?\n+\tcmovns\t%r10,%rax\n+\n+\tand\t$len,%rax\t\t\t# is input of favourable length?\n+\tjz\t.Lblocks_vpmadd52_4x\n+\n+\tsub\t\t%rax,$len\n \tmov\t\t\u005c$7,%r10d\n \tmov\t\t\u005c$1,%r11d\n \tkmovw\t\t%r10d,%k7\n \tlea\t\t.L2_44_inp_permd(%rip),%r10\n-\tshl\t\t\u005c$40,$padbit\n \tkmovw\t\t%r11d,%k1\n \n \tvmovq\t\t$padbit,%x#$PAD\n@@ -2849,16 +2879,451 @@ poly1305_blocks_vpmadd52:\n \n \tvpaddq\t\t$T0,$Dlo,$Dlo\n \n-\tdec\t\t$len\t\t\t# len-\u003d16\n+\tdec\t\t%rax\t\t\t# len-\u003d16\n \tjnz\t\t.Loop_vpmadd52\n \n \tvmovdqu64\t$Dlo,0($ctx){%k7}\t# store hash value\n \n+\ttest\t\t$len,$len\n+\tjnz\t\t.Lblocks_vpmadd52_4x\n+\n .Lno_data_vpmadd52:\n \tret\n .size\tpoly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52\n ___\n }\n+{\n+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) \u003d map(\u0022%ymm$_\u0022,(0..5,16,17));\n+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) \u003d map(\u0022%ymm$_\u0022,(18..23));\n+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) \u003d map(\u0022%ymm$_\u0022,(24..31));\n+\n+$code.\u003d\u003c\u003c___;\n+.type\tpoly1305_blocks_vpmadd52_4x,\u005c@function,4\n+.align\t32\n+poly1305_blocks_vpmadd52_4x:\n+\tshr\t\u005c$4,$len\n+\tjz\t.Lno_data_vpmadd52_4x\t\t# too short\n+\n+\tshl\t\u005c$40,$padbit\n+\tmov\t64($ctx),%r8\t\t\t# peek on power of the key\n+\n+.Lblocks_vpmadd52_4x:\n+\tvpbroadcastq\t$padbit,$PAD\n+\n+\tvmovdqa64\t.Lx_mask44(%rip),$mask44\n+\tmov\t\t\u005c$5,%eax\n+\tvmovdqa64\t.Lx_mask42(%rip),$mask42\n+\tkmovw\t\t%eax,%k1\t\t# used in 2x path\n+\n+\ttest\t\t%r8,%r8\t\t\t# is power value impossible?\n+\tjs\t\t.Linit_vpmadd52\t\t# if it is, then init R[4]\n+\n+\tvmovq\t\t0($ctx),%x#$H0\t\t# load current hash value\n+\tvmovq\t\t8($ctx),%x#$H1\n+\tvmovq\t\t16($ctx),%x#$H2\n+\n+\ttest\t\t\u005c$3,$len\t\t# is length 4*n+2?\n+\tjnz\t\t.Lblocks_vpmadd52_2x_do\n+\n+.Lblocks_vpmadd52_4x_do:\n+\tvpbroadcastq\t64($ctx),$R0\t\t# load 4th power of the key\n+\tvpbroadcastq\t96($ctx),$R1\n+\tvpbroadcastq\t128($ctx),$R2\n+\tvpbroadcastq\t160($ctx),$S1\n+\n+.Lblocks_vpmadd52_4x_key_loaded:\n+\tvpsllq\t\t\u005c$2,$R2,$S2\t\t# S2 \u003d R2*5*4\n+\tvpaddq\t\t$R2,$S2,$S2\n+\tvpsllq\t\t\u005c$2,$S2,$S2\n+\n+\tvmovdqu64\t16*0($inp),$T2\t\t# load data\n+\tvmovdqu64\t16*2($inp),$T3\n+\tlea\t\t16*4($inp),$inp\n+\n+\tvpunpcklqdq\t$T3,$T2,$T1\t\t# transpose data\n+\tvpunpckhqdq\t$T3,$T2,$T3\n+\n+\t# at this point 64-bit lanes are ordered as 3-1-2-0\n+\n+\tvpsrlq\t\t\u005c$24,$T3,$T2\t\t# splat the data\n+\tvporq\t\t$PAD,$T2,$T2\n+\t vpaddq\t\t$T2,$H2,$H2\t\t# accumulate input\n+\tvpandq\t\t$mask44,$T1,$T0\n+\tvpsrlq\t\t\u005c$44,$T1,$T1\n+\tvpsllq\t\t\u005c$20,$T3,$T3\n+\tvporq\t\t$T3,$T1,$T1\n+\tvpandq\t\t$mask44,$T1,$T1\n+\n+\tsub\t\t\u005c$4,$len\n+\tjz\t\t.Ltail_vpmadd52_4x\n+\tjmp\t\t.Loop_vpmadd52_4x\n+\tud2\n+\n+.align\t32\n+.Linit_vpmadd52:\n+\tvmovq\t\t24($ctx),%x#$S1\t\t# load key\n+\tvmovq\t\t56($ctx),%x#$H2\n+\tvmovq\t\t32($ctx),%x#$S2\n+\tvmovq\t\t40($ctx),%x#$R0\n+\tvmovq\t\t48($ctx),%x#$R1\n+\n+\tvmovdqa\t\t$R0,$H0\n+\tvmovdqa\t\t$R1,$H1\n+\tvmovdqa\t\t$H2,$R2\n+\n+\tmov\t\t\u005c$2,%eax\n+\n+.Lmul_init_vpmadd52:\n+\tvpxorq\t\t$D0lo,$D0lo,$D0lo\n+\tvpmadd52luq\t$H2,$S1,$D0lo\n+\tvpxorq\t\t$D0hi,$D0hi,$D0hi\n+\tvpmadd52huq\t$H2,$S1,$D0hi\n+\tvpxorq\t\t$D1lo,$D1lo,$D1lo\n+\tvpmadd52luq\t$H2,$S2,$D1lo\n+\tvpxorq\t\t$D1hi,$D1hi,$D1hi\n+\tvpmadd52huq\t$H2,$S2,$D1hi\n+\tvpxorq\t\t$D2lo,$D2lo,$D2lo\n+\tvpmadd52luq\t$H2,$R0,$D2lo\n+\tvpxorq\t\t$D2hi,$D2hi,$D2hi\n+\tvpmadd52huq\t$H2,$R0,$D2hi\n+\n+\tvpmadd52luq\t$H0,$R0,$D0lo\n+\tvpmadd52huq\t$H0,$R0,$D0hi\n+\tvpmadd52luq\t$H0,$R1,$D1lo\n+\tvpmadd52huq\t$H0,$R1,$D1hi\n+\tvpmadd52luq\t$H0,$R2,$D2lo\n+\tvpmadd52huq\t$H0,$R2,$D2hi\n+\n+\tvpmadd52luq\t$H1,$S2,$D0lo\n+\tvpmadd52huq\t$H1,$S2,$D0hi\n+\tvpmadd52luq\t$H1,$R0,$D1lo\n+\tvpmadd52huq\t$H1,$R0,$D1hi\n+\tvpmadd52luq\t$H1,$R1,$D2lo\n+\tvpmadd52huq\t$H1,$R1,$D2hi\n+\n+\t################################################################\n+\t# partial reduction\n+\tvpsrlq\t\t\u005c$44,$D0lo,$tmp\n+\tvpsllq\t\t\u005c$8,$D0hi,$D0hi\n+\tvpandq\t\t$mask44,$D0lo,$H0\n+\tvpaddq\t\t$tmp,$D0hi,$D0hi\n+\n+\tvpaddq\t\t$D0hi,$D1lo,$D1lo\n+\n+\tvpsrlq\t\t\u005c$44,$D1lo,$tmp\n+\tvpsllq\t\t\u005c$8,$D1hi,$D1hi\n+\tvpandq\t\t$mask44,$D1lo,$H1\n+\tvpaddq\t\t$tmp,$D1hi,$D1hi\n+\n+\tvpaddq\t\t$D1hi,$D2lo,$D2lo\n+\n+\tvpsrlq\t\t\u005c$42,$D2lo,$tmp\n+\tvpsllq\t\t\u005c$10,$D2hi,$D2hi\n+\tvpandq\t\t$mask42,$D2lo,$H2\n+\tvpaddq\t\t$tmp,$D2hi,$D2hi\n+\n+\tvpaddq\t\t$D2hi,$H0,$H0\n+\tvpsllq\t\t\u005c$2,$D2hi,$D2hi\n+\n+\tvpaddq\t\t$D2hi,$H0,$H0\n+\n+\tvpsrlq\t\t\u005c$44,$H0,$tmp\t\t# additional step\n+\tvpandq\t\t$mask44,$H0,$H0\n+\n+\tvpaddq\t\t$tmp,$H1,$H1\n+\n+\tdec\t\t%eax\n+\tjz\t\t.Ldone_init_vpmadd52\n+\n+\tvpunpcklqdq\t$R1,$H1,$R1\t\t# 1,2\n+\tvpbroadcastq\t%x#$H1,%x#$H1\t\t# 2,2\n+\tvpunpcklqdq\t$R2,$H2,$R2\n+\tvpbroadcastq\t%x#$H2,%x#$H2\n+\tvpunpcklqdq\t$R0,$H0,$R0\n+\tvpbroadcastq\t%x#$H0,%x#$H0\n+\n+\tvpsllq\t\t\u005c$2,$R1,$S1\t\t# S1 \u003d R1*5*4\n+\tvpsllq\t\t\u005c$2,$R2,$S2\t\t# S2 \u003d R2*5*4\n+\tvpaddq\t\t$R1,$S1,$S1\n+\tvpaddq\t\t$R2,$S2,$S2\n+\tvpsllq\t\t\u005c$2,$S1,$S1\n+\tvpsllq\t\t\u005c$2,$S2,$S2\n+\n+\tjmp\t\t.Lmul_init_vpmadd52\n+\tud2\n+\n+.align\t32\n+.Ldone_init_vpmadd52:\n+\tvinserti128\t\u005c$1,%x#$R1,$H1,$R1\t# 1,2,3,4\n+\tvinserti128\t\u005c$1,%x#$R2,$H2,$R2\n+\tvinserti128\t\u005c$1,%x#$R0,$H0,$R0\n+\n+\tvpermq\t\t\u005c$0b11011000,$R1,$R1\t# 1,3,2,4\n+\tvpermq\t\t\u005c$0b11011000,$R2,$R2\n+\tvpermq\t\t\u005c$0b11011000,$R0,$R0\n+\n+\tvpsllq\t\t\u005c$2,$R1,$S1\t\t# S1 \u003d R1*5*4\n+\tvpaddq\t\t$R1,$S1,$S1\n+\tvpsllq\t\t\u005c$2,$S1,$S1\n+\n+\tvmovq\t\t0($ctx),%x#$H0\t\t# load current hash value\n+\tvmovq\t\t8($ctx),%x#$H1\n+\tvmovq\t\t16($ctx),%x#$H2\n+\n+\ttest\t\t\u005c$3,$len\t\t# is length 4*n+2?\n+\tjnz\t\t.Ldone_init_vpmadd52_2x\n+\n+\tvmovdqu64\t$R0,64($ctx)\t\t# save key powers\n+\tvpbroadcastq\t%x#$R0,$R0\t\t# broadcast 4th power\n+\tvmovdqu64\t$R1,96($ctx)\n+\tvpbroadcastq\t%x#$R1,$R1\n+\tvmovdqu64\t$R2,128($ctx)\n+\tvpbroadcastq\t%x#$R2,$R2\n+\tvmovdqu64\t$S1,160($ctx)\n+\tvpbroadcastq\t%x#$S1,$S1\n+\n+\tjmp\t\t.Lblocks_vpmadd52_4x_key_loaded\n+\tud2\n+\n+.align\t32\n+.Ldone_init_vpmadd52_2x:\n+\tvmovdqu64\t$R0,64($ctx)\t\t# save key powers\n+\tvpsrldq\t\t\u005c$8,$R0,$R0\t\t# 0-1-0-2\n+\tvmovdqu64\t$R1,96($ctx)\n+\tvpsrldq\t\t\u005c$8,$R1,$R1\n+\tvmovdqu64\t$R2,128($ctx)\n+\tvpsrldq\t\t\u005c$8,$R2,$R2\n+\tvmovdqu64\t$S1,160($ctx)\n+\tvpsrldq\t\t\u005c$8,$S1,$S1\n+\tjmp\t\t.Lblocks_vpmadd52_2x_key_loaded\n+\tud2\n+\n+.align\t32\n+.Lblocks_vpmadd52_2x_do:\n+\tvmovdqu64\t128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers\n+\tvmovdqu64\t160+8($ctx),${S1}{%k1}{z}\n+\tvmovdqu64\t64+8($ctx),${R0}{%k1}{z}\n+\tvmovdqu64\t96+8($ctx),${R1}{%k1}{z}\n+\n+.Lblocks_vpmadd52_2x_key_loaded:\n+\tvmovdqu64\t16*0($inp),$T2\t\t# load data\n+\tvpxorq\t\t$T3,$T3,$T3\n+\tlea\t\t16*2($inp),$inp\n+\n+\tvpunpcklqdq\t$T3,$T2,$T1\t\t# transpose data\n+\tvpunpckhqdq\t$T3,$T2,$T3\n+\n+\t# at this point 64-bit lanes are ordered as x-1-x-0\n+\n+\tvpsrlq\t\t\u005c$24,$T3,$T2\t\t# splat the data\n+\tvporq\t\t$PAD,$T2,$T2\n+\t vpaddq\t\t$T2,$H2,$H2\t\t# accumulate input\n+\tvpandq\t\t$mask44,$T1,$T0\n+\tvpsrlq\t\t\u005c$44,$T1,$T1\n+\tvpsllq\t\t\u005c$20,$T3,$T3\n+\tvporq\t\t$T3,$T1,$T1\n+\tvpandq\t\t$mask44,$T1,$T1\n+\n+\tjmp\t\t.Ltail_vpmadd52_2x\n+\tud2\n+\n+.align\t32\n+.Loop_vpmadd52_4x:\n+\t#vpaddq\t\t$T2,$H2,$H2\t\t# accumulate input\n+\tvpaddq\t\t$T0,$H0,$H0\n+\tvpaddq\t\t$T1,$H1,$H1\n+\n+\tvpxorq\t\t$D0lo,$D0lo,$D0lo\n+\tvpmadd52luq\t$H2,$S1,$D0lo\n+\tvpxorq\t\t$D0hi,$D0hi,$D0hi\n+\tvpmadd52huq\t$H2,$S1,$D0hi\n+\tvpxorq\t\t$D1lo,$D1lo,$D1lo\n+\tvpmadd52luq\t$H2,$S2,$D1lo\n+\tvpxorq\t\t$D1hi,$D1hi,$D1hi\n+\tvpmadd52huq\t$H2,$S2,$D1hi\n+\tvpxorq\t\t$D2lo,$D2lo,$D2lo\n+\tvpmadd52luq\t$H2,$R0,$D2lo\n+\tvpxorq\t\t$D2hi,$D2hi,$D2hi\n+\tvpmadd52huq\t$H2,$R0,$D2hi\n+\n+\t vmovdqu64\t16*0($inp),$T2\t\t# load data\n+\t vmovdqu64\t16*2($inp),$T3\n+\t lea\t\t16*4($inp),$inp\n+\tvpmadd52luq\t$H0,$R0,$D0lo\n+\tvpmadd52huq\t$H0,$R0,$D0hi\n+\tvpmadd52luq\t$H0,$R1,$D1lo\n+\tvpmadd52huq\t$H0,$R1,$D1hi\n+\tvpmadd52luq\t$H0,$R2,$D2lo\n+\tvpmadd52huq\t$H0,$R2,$D2hi\n+\n+\t vpunpcklqdq\t$T3,$T2,$T1\t\t# transpose data\n+\t vpunpckhqdq\t$T3,$T2,$T3\n+\tvpmadd52luq\t$H1,$S2,$D0lo\n+\tvpmadd52huq\t$H1,$S2,$D0hi\n+\tvpmadd52luq\t$H1,$R0,$D1lo\n+\tvpmadd52huq\t$H1,$R0,$D1hi\n+\tvpmadd52luq\t$H1,$R1,$D2lo\n+\tvpmadd52huq\t$H1,$R1,$D2hi\n+\n+\t################################################################\n+\t# partial reduction (interleaved with data splat)\n+\tvpsrlq\t\t\u005c$44,$D0lo,$tmp\n+\tvpsllq\t\t\u005c$8,$D0hi,$D0hi\n+\tvpandq\t\t$mask44,$D0lo,$H0\n+\tvpaddq\t\t$tmp,$D0hi,$D0hi\n+\n+\t vpsrlq\t\t\u005c$24,$T3,$T2\n+\t vporq\t\t$PAD,$T2,$T2\n+\tvpaddq\t\t$D0hi,$D1lo,$D1lo\n+\n+\tvpsrlq\t\t\u005c$44,$D1lo,$tmp\n+\tvpsllq\t\t\u005c$8,$D1hi,$D1hi\n+\tvpandq\t\t$mask44,$D1lo,$H1\n+\tvpaddq\t\t$tmp,$D1hi,$D1hi\n+\n+\t vpandq\t\t$mask44,$T1,$T0\n+\t vpsrlq\t\t\u005c$44,$T1,$T1\n+\t vpsllq\t\t\u005c$20,$T3,$T3\n+\tvpaddq\t\t$D1hi,$D2lo,$D2lo\n+\n+\tvpsrlq\t\t\u005c$42,$D2lo,$tmp\n+\tvpsllq\t\t\u005c$10,$D2hi,$D2hi\n+\tvpandq\t\t$mask42,$D2lo,$H2\n+\tvpaddq\t\t$tmp,$D2hi,$D2hi\n+\n+\t vpaddq\t$T2,$H2,$H2\t\t# accumulate input\n+\tvpaddq\t\t$D2hi,$H0,$H0\n+\tvpsllq\t\t\u005c$2,$D2hi,$D2hi\n+\n+\tvpaddq\t\t$D2hi,$H0,$H0\n+\t vporq\t\t$T3,$T1,$T1\n+\t vpandq\t\t$mask44,$T1,$T1\n+\n+\tvpsrlq\t\t\u005c$44,$H0,$tmp\t\t# additional step\n+\tvpandq\t\t$mask44,$H0,$H0\n+\n+\tvpaddq\t\t$tmp,$H1,$H1\n+\n+\tsub\t\t\u005c$4,$len\t\t# len-\u003d64\n+\tjnz\t\t.Loop_vpmadd52_4x\n+\n+.Ltail_vpmadd52_4x:\n+\tvmovdqu64\t128($ctx),$R2\t\t# load all key powers\n+\tvmovdqu64\t160($ctx),$S1\n+\tvmovdqu64\t64($ctx),$R0\n+\tvmovdqu64\t96($ctx),$R1\n+\n+.Ltail_vpmadd52_2x:\n+\tvpsllq\t\t\u005c$2,$R2,$S2\t\t# S2 \u003d R2*5*4\n+\tvpaddq\t\t$R2,$S2,$S2\n+\tvpsllq\t\t\u005c$2,$S2,$S2\n+\n+\t#vpaddq\t\t$T2,$H2,$H2\t\t# accumulate input\n+\tvpaddq\t\t$T0,$H0,$H0\n+\tvpaddq\t\t$T1,$H1,$H1\n+\n+\tvpxorq\t\t$D0lo,$D0lo,$D0lo\n+\tvpmadd52luq\t$H2,$S1,$D0lo\n+\tvpxorq\t\t$D0hi,$D0hi,$D0hi\n+\tvpmadd52huq\t$H2,$S1,$D0hi\n+\tvpxorq\t\t$D1lo,$D1lo,$D1lo\n+\tvpmadd52luq\t$H2,$S2,$D1lo\n+\tvpxorq\t\t$D1hi,$D1hi,$D1hi\n+\tvpmadd52huq\t$H2,$S2,$D1hi\n+\tvpxorq\t\t$D2lo,$D2lo,$D2lo\n+\tvpmadd52luq\t$H2,$R0,$D2lo\n+\tvpxorq\t\t$D2hi,$D2hi,$D2hi\n+\tvpmadd52huq\t$H2,$R0,$D2hi\n+\n+\tvpmadd52luq\t$H0,$R0,$D0lo\n+\tvpmadd52huq\t$H0,$R0,$D0hi\n+\tvpmadd52luq\t$H0,$R1,$D1lo\n+\tvpmadd52huq\t$H0,$R1,$D1hi\n+\tvpmadd52luq\t$H0,$R2,$D2lo\n+\tvpmadd52huq\t$H0,$R2,$D2hi\n+\n+\tvpmadd52luq\t$H1,$S2,$D0lo\n+\tvpmadd52huq\t$H1,$S2,$D0hi\n+\tvpmadd52luq\t$H1,$R0,$D1lo\n+\tvpmadd52huq\t$H1,$R0,$D1hi\n+\tvpmadd52luq\t$H1,$R1,$D2lo\n+\tvpmadd52huq\t$H1,$R1,$D2hi\n+\n+\t################################################################\n+\t# horizontal addition\n+\n+\tmov\t\t\u005c$1,%eax\n+\tkmovw\t\t%eax,%k1\n+\tvpsrldq\t\t\u005c$8,$D0lo,$T0\n+\tvpsrldq\t\t\u005c$8,$D0hi,$H0\n+\tvpsrldq\t\t\u005c$8,$D1lo,$T1\n+\tvpsrldq\t\t\u005c$8,$D1hi,$H1\n+\tvpaddq\t\t$T0,$D0lo,$D0lo\n+\tvpaddq\t\t$H0,$D0hi,$D0hi\n+\tvpsrldq\t\t\u005c$8,$D2lo,$T2\n+\tvpsrldq\t\t\u005c$8,$D2hi,$H2\n+\tvpaddq\t\t$T1,$D1lo,$D1lo\n+\tvpaddq\t\t$H1,$D1hi,$D1hi\n+\t vpermq\t\t\u005c$0x2,$D0lo,$T0\n+\t vpermq\t\t\u005c$0x2,$D0hi,$H0\n+\tvpaddq\t\t$T2,$D2lo,$D2lo\n+\tvpaddq\t\t$H2,$D2hi,$D2hi\n+\n+\tvpermq\t\t\u005c$0x2,$D1lo,$T1\n+\tvpermq\t\t\u005c$0x2,$D1hi,$H1\n+\tvpaddq\t\t$T0,$D0lo,${D0lo}{%k1}{z}\n+\tvpaddq\t\t$H0,$D0hi,${D0hi}{%k1}{z}\n+\tvpermq\t\t\u005c$0x2,$D2lo,$T2\n+\tvpermq\t\t\u005c$0x2,$D2hi,$H2\n+\tvpaddq\t\t$T1,$D1lo,${D1lo}{%k1}{z}\n+\tvpaddq\t\t$H1,$D1hi,${D1hi}{%k1}{z}\n+\tvpaddq\t\t$T2,$D2lo,${D2lo}{%k1}{z}\n+\tvpaddq\t\t$H2,$D2hi,${D2hi}{%k1}{z}\n+\n+\t################################################################\n+\t# partial reduction\n+\tvpsrlq\t\t\u005c$44,$D0lo,$tmp\n+\tvpsllq\t\t\u005c$8,$D0hi,$D0hi\n+\tvpandq\t\t$mask44,$D0lo,$H0\n+\tvpaddq\t\t$tmp,$D0hi,$D0hi\n+\n+\tvpaddq\t\t$D0hi,$D1lo,$D1lo\n+\n+\tvpsrlq\t\t\u005c$44,$D1lo,$tmp\n+\tvpsllq\t\t\u005c$8,$D1hi,$D1hi\n+\tvpandq\t\t$mask44,$D1lo,$H1\n+\tvpaddq\t\t$tmp,$D1hi,$D1hi\n+\n+\tvpaddq\t\t$D1hi,$D2lo,$D2lo\n+\n+\tvpsrlq\t\t\u005c$42,$D2lo,$tmp\n+\tvpsllq\t\t\u005c$10,$D2hi,$D2hi\n+\tvpandq\t\t$mask42,$D2lo,$H2\n+\tvpaddq\t\t$tmp,$D2hi,$D2hi\n+\n+\tvpaddq\t\t$D2hi,$H0,$H0\n+\tvpsllq\t\t\u005c$2,$D2hi,$D2hi\n+\n+\tvpaddq\t\t$D2hi,$H0,$H0\n+\n+\tvpsrlq\t\t\u005c$44,$H0,$tmp\t\t# additional step\n+\tvpandq\t\t$mask44,$H0,$H0\n+\n+\tvpaddq\t\t$tmp,$H1,$H1\n+\t\t\t\t\t\t# at this point $len is\n+\t\t\t\t\t\t# either 4*n+2 or 0...\n+\tsub\t\t\u005c$2,$len\t\t# len-\u003d32\n+\tja\t\t.Lblocks_vpmadd52_4x_do\n+\n+\tvmovq\t\t%x#$H0,0($ctx)\n+\tvmovq\t\t%x#$H1,8($ctx)\n+\tvmovq\t\t%x#$H2,16($ctx)\n+\n+.Lno_data_vpmadd52_4x:\n+\tret\n+.size\tpoly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x\n+___\n+}\n $code.\u003d\u003c\u003c___;\n .type\tpoly1305_emit_base2_44,\u005c@function,3\n .align\t32\n@@ -2920,6 +3385,13 @@ $code.\u003d\u003c\u003c___;\n .quad\t44,44,42,64\n .L2_44_shift_lft:\n .quad\t8,8,10,64\n+\n+.Lx_mask44:\n+.quad\t0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff\n+.quad\t0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff\n+.Lx_mask42:\n+.quad\t0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff\n+.quad\t0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff\n ___\n }\n \n","s":{"c":1752657393,"u": 29372}} ],"g": 31365,"chitpc": 0,"ehitpc": 0,"indexed":0 , "ab": 0, "si": 0, "db":0, "di":0, "sat":0, "lfc": "0000"}