{"schema":"libjg2-1",
"vpath":"/git/",
"avatar":"/git/avatar/",
"alang":"",
"gen_ut":1755805050,
"reponame":"openssl",
"desc":"OpenSSL",
"owner": { "name": "Andy Green", "email": "andy@warmcat.com", "md5": "c50933ca2aa61e0fe2c43d46bb6b59cb" },"url":"https://warmcat.com/repo/openssl",
"f":3,
"items": [
{"schema":"libjg2-1",
"cid":"add1e4c467fb66c7d9bcc5d7f5027465",
"commit": {"type":"commit",
"time": 1482679900,
"time_ofs": 60,
"oid_tree": { "oid": "317550e84ada89b0d92f71706a55c3c50d1377e4", "alias": []},
"oid":{ "oid": "3c274a6e2016b6724fbfe3ff1487efa2a536ece4", "alias": []},
"msg": "chacha/asm/chacha-x86_64.pl: add AVX512 path optimized for shorter inputs.",
"sig_commit": { "git_time": { "time": 1482679900, "offset": 60 }, "name": "Andy Polyakov", "email": "appro@openssl.org", "md5": "50bd64fa2a792cbbf679fa16213a3b2a" },
"sig_author": { "git_time": { "time": 1482161195, "offset": 60 }, "name": "Andy Polyakov", "email": "appro@openssl.org", "md5": "50bd64fa2a792cbbf679fa16213a3b2a" }},
"body": "chacha/asm/chacha-x86_64.pl: add AVX512 path optimized for shorter inputs.\n\nReviewed-by: Richard Levitte \u003clevitte@openssl.org\u003e\n"
,
"diff": "diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl\nindex fd3fdeb..ac169ee 100755\n--- a/crypto/chacha/asm/chacha-x86_64.pl\n+++ b/crypto/chacha/asm/chacha-x86_64.pl\n@@ -112,6 +112,10 @@ $code.\u003d\u003c\u003c___;\n .Lsigma:\n .asciz\t\u0022expand 32-byte k\u0022\n .align\t64\n+.Lzeroz:\n+.long\t0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0\n+.Lfourz:\n+.long\t4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0\n .Lincz:\n .long\t0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15\n .Lsixteen:\n@@ -241,6 +245,12 @@ ChaCha20_ctr32:\n \tcmp\t\u005c$0,$len\n \tje\t.Lno_data\n \tmov\tOPENSSL_ia32cap_P+4(%rip),%r10\n+___\n+$code.\u003d\u003c\u003c___\tif ($avx\u003e2);\n+\tbt\t\u005c$48,%r10\t\t# check for AVX512F\n+\tjc\t.LChaCha20_avx512\n+___\n+$code.\u003d\u003c\u003c___;\n \ttest\t\u005c$`1\u003c\u003c(41-32)`,%r10d\n \tjnz\t.LChaCha20_ssse3\n \n@@ -447,7 +457,7 @@ $code.\u003d\u003c\u003c___;\n \tja\t.LChaCha20_4x\t\t# but overall it won't be slower\n \n .Ldo_sse3_after_all:\n-\tpush\t%rbx\n+\tpush\t%rbx\t\t\t# just to share SEH handler, no pops\n \tpush\t%rbp\n \tpush\t%r12\n \tpush\t%r13\n@@ -472,7 +482,7 @@ $code.\u003d\u003c\u003c___;\n \tmovdqa\t$b,0x10(%rsp)\n \tmovdqa\t$c,0x20(%rsp)\n \tmovdqa\t$d,0x30(%rsp)\n-\tmov\t\u005c$10,%ebp\n+\tmov\t\u005c$10,$counter\t\t# reuse $counter\n \tjmp\t.Loop_ssse3\n \n .align\t32\n@@ -482,7 +492,7 @@ $code.\u003d\u003c\u003c___;\n \tmovdqa\t0x10(%rsp),$b\n \tmovdqa\t0x20(%rsp),$c\n \tpaddd\t0x30(%rsp),$d\n-\tmov\t\u005c$10,%ebp\n+\tmov\t\u005c$10,$counter\n \tmovdqa\t$d,0x30(%rsp)\n \tjmp\t.Loop_ssse3\n \n@@ -500,7 +510,7 @@ ___\n \t\u0026pshufd\t($b,$b,0b10010011);\n \t\u0026pshufd\t($d,$d,0b00111001);\n \n-\t\u0026dec\t(\u0022%ebp\u0022);\n+\t\u0026dec\t($counter);\n \t\u0026jnz\t(\u0022.Loop_ssse3\u0022);\n \n $code.\u003d\u003c\u003c___;\n@@ -539,14 +549,14 @@ $code.\u003d\u003c\u003c___;\n \tmovdqa\t$b,0x10(%rsp)\n \tmovdqa\t$c,0x20(%rsp)\n \tmovdqa\t$d,0x30(%rsp)\n-\txor\t%rbx,%rbx\n+\txor\t$counter,$counter\n \n .Loop_tail_ssse3:\n-\tmovzb\t($inp,%rbx),%eax\n-\tmovzb\t(%rsp,%rbx),%ecx\n-\tlea\t1(%rbx),%rbx\n+\tmovzb\t($inp,$counter),%eax\n+\tmovzb\t(%rsp,$counter),%ecx\n+\tlea\t1($counter),$counter\n \txor\t%ecx,%eax\n-\tmov\t%al,-1($out,%rbx)\n+\tmov\t%al,-1($out,$counter)\n \tdec\t$len\n \tjnz\t.Loop_tail_ssse3\n \n@@ -557,13 +567,7 @@ $code.\u003d\u003c\u003c___\tif ($win64);\n \tmovaps\t64+48(%rsp),%xmm7\n ___\n $code.\u003d\u003c\u003c___;\n-\tadd\t\u005c$64+$xframe,%rsp\n-\tpop\t%r15\n-\tpop\t%r14\n-\tpop\t%r13\n-\tpop\t%r12\n-\tpop\t%rbp\n-\tpop\t%rbx\n+\tadd\t\u005c$64+$xframe+48,%rsp\n \tret\n .size\tChaCha20_ssse3,.-ChaCha20_ssse3\n ___\n@@ -1732,12 +1736,6 @@ $code.\u003d\u003c\u003c___;\n .align\t32\n ChaCha20_8x:\n .LChaCha20_8x:\n-___\n-$code.\u003d\u003c\u003c___\t\tif ($avx\u003e2);\n-\ttest\t\t\u005c$`1\u003c\u003c16`,%r10d\t\t\t# check for AVX512F\n-\tjnz\t\t.LChaCha20_16x\n-___\n-$code.\u003d\u003c\u003c___;\n \tmov\t\t%rsp,%r10\n \tsub\t\t\u005c$0x280+$xframe,%rsp\n \tand\t\t\u005c$-32,%rsp\n@@ -2229,7 +2227,7 @@ $code.\u003d\u003c\u003c___;\n \tjnz\t\t.Loop_tail8x\n \n .Ldone8x:\n-\tvzeroupper\n+\tvzeroall\n ___\n $code.\u003d\u003c\u003c___\tif ($win64);\n \tlea\t\t0x290+0x30(%rsp),%r11\n@@ -2254,6 +2252,228 @@ ___\n ########################################################################\n # AVX512 code paths\n if ($avx\u003e2) {\n+# This one handles shorter inputs...\n+\n+my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) \u003d map(\u0022%zmm$_\u0022,(0..3,16..20));\n+my ($t0,$t1,$t2,$t3) \u003d map(\u0022%xmm$_\u0022,(4..7));\n+\n+sub AVX512ROUND {\t# critical path is 14 \u0022SIMD ticks\u0022 per round\n+\t\u0026vpaddd\t($a,$a,$b);\n+\t\u0026vpxord\t($d,$d,$a);\n+\t\u0026vprold\t($d,$d,16);\n+\n+\t\u0026vpaddd\t($c,$c,$d);\n+\t\u0026vpxord\t($b,$b,$c);\n+\t\u0026vprold\t($b,$b,12);\n+\n+\t\u0026vpaddd\t($a,$a,$b);\n+\t\u0026vpxord\t($d,$d,$a);\n+\t\u0026vprold\t($d,$d,8);\n+\n+\t\u0026vpaddd\t($c,$c,$d);\n+\t\u0026vpxord\t($b,$b,$c);\n+\t\u0026vprold\t($b,$b,7);\n+}\n+\n+my $xframe \u003d $win64 ? 32+32+8 : 24;\n+\n+$code.\u003d\u003c\u003c___;\n+.type\tChaCha20_avx512,\u005c@function,5\n+.align\t32\n+ChaCha20_avx512:\n+.LChaCha20_avx512:\n+\tcmp\t\u005c$512,$len\n+\tja\t.LChaCha20_16x\n+\n+\tpush\t%rbx\t\t\t# just to share SEH handler, no pops\n+\tpush\t%rbp\n+\tpush\t%r12\n+\tpush\t%r13\n+\tpush\t%r14\n+\tpush\t%r15\n+\n+\tsub\t\u005c$64+$xframe,%rsp\n+___\n+$code.\u003d\u003c\u003c___\tif ($win64);\n+\tmovaps\t%xmm6,64+32(%rsp)\n+\tmovaps\t%xmm7,64+48(%rsp)\n+___\n+$code.\u003d\u003c\u003c___;\n+\tvbroadcasti32x4\t.Lsigma(%rip),$a\n+\tvbroadcasti32x4\t($key),$b\n+\tvbroadcasti32x4\t16($key),$c\n+\tvbroadcasti32x4\t($counter),$d\n+\n+\tvmovdqa32\t$a,$a_\n+\tvmovdqa32\t$b,$b_\n+\tvmovdqa32\t$c,$c_\n+\tvpaddd\t\t.Lzeroz(%rip),$d,$d\n+\tvmovdqa32\t.Lfourz(%rip),$fourz\n+\tmov\t\t\u005c$10,$counter\t# reuse $counter\n+\tvmovdqa32\t$d,$d_\n+\tjmp\t\t.Loop_avx512\n+\n+.align\t16\n+.Loop_outer_avx512:\n+\tvmovdqa32\t$a_,$a\n+\tvmovdqa32\t$b_,$b\n+\tvmovdqa32\t$c_,$c\n+\tvpaddd\t\t$fourz,$d_,$d\n+\tmov\t\t\u005c$10,$counter\n+\tvmovdqa32\t$d,$d_\n+\tjmp\t\t.Loop_avx512\n+\n+.align\t32\n+.Loop_avx512:\n+___\n+\t\u0026AVX512ROUND();\n+\t\u0026vpshufd\t($c,$c,0b01001110);\n+\t\u0026vpshufd\t($b,$b,0b00111001);\n+\t\u0026vpshufd\t($d,$d,0b10010011);\n+\n+\t\u0026AVX512ROUND();\n+\t\u0026vpshufd\t($c,$c,0b01001110);\n+\t\u0026vpshufd\t($b,$b,0b10010011);\n+\t\u0026vpshufd\t($d,$d,0b00111001);\n+\n+\t\u0026dec\t\t($counter);\n+\t\u0026jnz\t\t(\u0022.Loop_avx512\u0022);\n+\n+$code.\u003d\u003c\u003c___;\n+\tvpaddd\t\t$a_,$a,$a\n+\tvpaddd\t\t$b_,$b,$b\n+\tvpaddd\t\t$c_,$c,$c\n+\tvpaddd\t\t$d_,$d,$d\n+\n+\tsub\t\t\u005c$64,$len\n+\tjb\t\t.Ltail64_avx512\n+\n+\tvpxor\t\t0x00($inp),%x#$a,$t0\t# xor with input\n+\tvpxor\t\t0x10($inp),%x#$b,$t1\n+\tvpxor\t\t0x20($inp),%x#$c,$t2\n+\tvpxor\t\t0x30($inp),%x#$d,$t3\n+\tlea\t\t0x40($inp),$inp\t\t# inp+\u003d64\n+\n+\tvmovdqu\t\t$t0,0x00($out)\t\t# write output\n+\tvmovdqu\t\t$t1,0x10($out)\n+\tvmovdqu\t\t$t2,0x20($out)\n+\tvmovdqu\t\t$t3,0x30($out)\n+\tlea\t\t0x40($out),$out\t\t# out+\u003d64\n+\n+\tjz\t\t.Ldone_avx512\n+\n+\tvextracti32x4\t\u005c$1,$a,$t0\n+\tvextracti32x4\t\u005c$1,$b,$t1\n+\tvextracti32x4\t\u005c$1,$c,$t2\n+\tvextracti32x4\t\u005c$1,$d,$t3\n+\n+\tsub\t\t\u005c$64,$len\n+\tjb\t\t.Ltail_avx512\n+\n+\tvpxor\t\t0x00($inp),$t0,$t0\t# xor with input\n+\tvpxor\t\t0x10($inp),$t1,$t1\n+\tvpxor\t\t0x20($inp),$t2,$t2\n+\tvpxor\t\t0x30($inp),$t3,$t3\n+\tlea\t\t0x40($inp),$inp\t\t# inp+\u003d64\n+\n+\tvmovdqu\t\t$t0,0x00($out)\t\t# write output\n+\tvmovdqu\t\t$t1,0x10($out)\n+\tvmovdqu\t\t$t2,0x20($out)\n+\tvmovdqu\t\t$t3,0x30($out)\n+\tlea\t\t0x40($out),$out\t\t# out+\u003d64\n+\n+\tjz\t\t.Ldone_avx512\n+\n+\tvextracti32x4\t\u005c$2,$a,$t0\n+\tvextracti32x4\t\u005c$2,$b,$t1\n+\tvextracti32x4\t\u005c$2,$c,$t2\n+\tvextracti32x4\t\u005c$2,$d,$t3\n+\n+\tsub\t\t\u005c$64,$len\n+\tjb\t\t.Ltail_avx512\n+\n+\tvpxor\t\t0x00($inp),$t0,$t0\t# xor with input\n+\tvpxor\t\t0x10($inp),$t1,$t1\n+\tvpxor\t\t0x20($inp),$t2,$t2\n+\tvpxor\t\t0x30($inp),$t3,$t3\n+\tlea\t\t0x40($inp),$inp\t\t# inp+\u003d64\n+\n+\tvmovdqu\t\t$t0,0x00($out)\t\t# write output\n+\tvmovdqu\t\t$t1,0x10($out)\n+\tvmovdqu\t\t$t2,0x20($out)\n+\tvmovdqu\t\t$t3,0x30($out)\n+\tlea\t\t0x40($out),$out\t\t# out+\u003d64\n+\n+\tjz\t\t.Ldone_avx512\n+\n+\tvextracti32x4\t\u005c$3,$a,$t0\n+\tvextracti32x4\t\u005c$3,$b,$t1\n+\tvextracti32x4\t\u005c$3,$c,$t2\n+\tvextracti32x4\t\u005c$3,$d,$t3\n+\n+\tsub\t\t\u005c$64,$len\n+\tjb\t\t.Ltail_avx512\n+\n+\tvpxor\t\t0x00($inp),$t0,$t0\t# xor with input\n+\tvpxor\t\t0x10($inp),$t1,$t1\n+\tvpxor\t\t0x20($inp),$t2,$t2\n+\tvpxor\t\t0x30($inp),$t3,$t3\n+\tlea\t\t0x40($inp),$inp\t\t# inp+\u003d64\n+\n+\tvmovdqu\t\t$t0,0x00($out)\t\t# write output\n+\tvmovdqu\t\t$t1,0x10($out)\n+\tvmovdqu\t\t$t2,0x20($out)\n+\tvmovdqu\t\t$t3,0x30($out)\n+\tlea\t\t0x40($out),$out\t\t# out+\u003d64\n+\n+\tjnz\t\t.Loop_outer_avx512\n+\n+\tjmp\t\t.Ldone_avx512\n+\n+.align\t16\n+.Ltail64_avx512:\n+\tvmovdqa\t\t%x#$a,0x00(%rsp)\n+\tvmovdqa\t\t%x#$b,0x10(%rsp)\n+\tvmovdqa\t\t%x#$c,0x20(%rsp)\n+\tvmovdqa\t\t%x#$d,0x30(%rsp)\n+\tadd\t\t\u005c$64,$len\n+\tjmp\t\t.Loop_tail_avx512\n+\n+.align\t16\n+.Ltail_avx512:\n+\tvmovdqa\t\t$t0,0x00(%rsp)\n+\tvmovdqa\t\t$t1,0x10(%rsp)\n+\tvmovdqa\t\t$t2,0x20(%rsp)\n+\tvmovdqa\t\t$t3,0x30(%rsp)\n+\tadd\t\t\u005c$64,$len\n+\n+.Loop_tail_avx512:\n+\tmovzb\t\t($inp,$counter),%eax\n+\tmovzb\t\t(%rsp,$counter),%ecx\n+\tlea\t\t1($counter),$counter\n+\txor\t\t%ecx,%eax\n+\tmov\t\t%al,-1($out,$counter)\n+\tdec\t\t$len\n+\tjnz\t\t.Loop_tail_avx512\n+\n+\tvmovdqa32\t$a_,0x00(%rsp)\n+\n+.Ldone_avx512:\n+\tvzeroall\n+___\n+$code.\u003d\u003c\u003c___\tif ($win64);\n+\tmovaps\t64+32(%rsp),%xmm6\n+\tmovaps\t64+48(%rsp),%xmm7\n+___\n+$code.\u003d\u003c\u003c___;\n+\tadd\t\u005c$64+$xframe+48,%rsp\n+\tret\n+.size\tChaCha20_avx512,.-ChaCha20_avx512\n+___\n+}\n+if ($avx\u003e2) {\n+# This one handles longer inputs...\n+\n my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,\n $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)\u003dmap(\u0022%zmm$_\u0022,(0..15));\n my @xx\u003d($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,\n@@ -2728,8 +2948,11 @@ $code.\u003d\u003c\u003c___;\n \tdec\t\t$len\n \tjnz\t\t.Loop_tail16x\n \n+\tvpxord\t\t$xa0,$xa0,$xa0\n+\tvmovdqa32\t$xa0,0(%rsp)\n+\n .Ldone16x:\n-\tvzeroupper\n+\tvzeroall\n ___\n $code.\u003d\u003c\u003c___\tif ($win64);\n \tlea\t\t0x290+0x30(%rsp),%r11\n@@ -2752,9 +2975,9 @@ ___\n }\n \n foreach (split(\u0022\u005cn\u0022,$code)) {\n-\ts/\u005c`([^\u005c`]*)\u005c`/eval $1/geo;\n+\ts/\u005c`([^\u005c`]*)\u005c`/eval $1/ge;\n \n-\ts/%x#%y/%x/go;\n+\ts/%x#%[yz]/%x/g;\t# \u0022down-shift\u0022\n \n \tprint $_,\u0022\u005cn\u0022;\n }\n","s":{"c":1755805050,"u": 23040}}
],"g": 25138,"chitpc": 0,"ehitpc": 0,"indexed":0
,
"ab": 0, "si": 0, "db":0, "di":0, "sat":0, "lfc": "0000"}