|
24 | 24 | #include <memory> |
25 | 25 | #include <vector> |
26 | 26 |
|
| 27 | + |
27 | 28 | #include "compression/compress.h" // IWYU pragma: export |
28 | 29 | #include "compression/distortion.h" |
29 | 30 | #include "util/threading_context.h" |
@@ -444,6 +445,146 @@ struct CompressTraits<SfpStream> { |
444 | 445 | } |
445 | 446 | }; |
446 | 447 |
|
| 448 | +template <> |
| 449 | +struct CompressTraits<int8_t> { |
| 450 | + using Packed = int8_t; |
| 451 | + |
| 452 | + static size_t CompressBound(size_t num) { return num * sizeof(Packed); } |
| 453 | + |
| 454 | + template <class DF, HWY_IF_F32_D(DF)> |
| 455 | + static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT raw, |
| 456 | + size_t num, CompressPerThread& /*tls*/, |
| 457 | + const PackedSpan<Packed>& packed, |
| 458 | + const size_t packed_ofs) { |
| 459 | + const hn::Repartition<int32_t, DF> di32; |
| 460 | + const hn::Repartition<int16_t, DF> di16; |
| 461 | + const hn::Repartition<int8_t, DF> di8; |
| 462 | + using VF = hn::Vec<DF>; |
| 463 | + const size_t NF = hn::Lanes(df); |
| 464 | + |
| 465 | + size_t i = 0; |
| 466 | + for (; i <= num - NF; i += NF) { |
| 467 | + const VF v = hn::LoadU(df, raw + i); |
| 468 | + auto vi32 = hn::NearestInt(v); |
| 469 | + auto vi16 = hn::DemoteTo(di16, vi32); |
| 470 | + auto vi8 = hn::DemoteTo(di8, vi16); |
| 471 | + hn::StoreU(vi8, di8, packed.ptr + packed_ofs + i); |
| 472 | + } |
| 473 | + const size_t remaining = num - i; |
| 474 | + if (remaining > 0) { |
| 475 | + const VF v = hn::LoadN(df, raw + i, remaining); |
| 476 | + auto vi32 = hn::NearestInt(v); |
| 477 | + auto vi16 = hn::DemoteTo(di16, vi32); |
| 478 | + auto vi8 = hn::DemoteTo(di8, vi16); |
| 479 | + hn::StoreN(vi8, di8, packed.ptr + packed_ofs + i, remaining); |
| 480 | + } |
| 481 | + } |
| 482 | + |
| 483 | + static float ToFloatSlow(const Packed x) { return static_cast<float>(x); } |
| 484 | + |
| 485 | + |
| 486 | + template <class DF, HWY_IF_F32_D(DF)> |
| 487 | + static HWY_INLINE void Load2(DF df, const PackedSpan<const Packed>& packed, |
| 488 | + const size_t packed_ofs, hn::Vec<DF>& raw0, |
| 489 | + hn::Vec<DF>& raw1) { |
| 490 | + const hn::Repartition<int32_t, DF> di32; |
| 491 | + const hn::Repartition<int16_t, DF> di16; |
| 492 | + const hn::Repartition<int8_t, DF> di8; |
| 493 | + const hn::Half<decltype(di8)> di8_half; |
| 494 | + |
| 495 | + const auto vec_i8 = hn::LoadU(di8_half, packed.ptr + packed_ofs); |
| 496 | + const auto vec_i8_full = hn::Combine(di8, hn::Zero(di8_half), vec_i8); |
| 497 | + const auto vec_i16 = hn::PromoteLowerTo(di16, vec_i8_full); |
| 498 | + const auto vec_i32_0 = hn::PromoteLowerTo(di32, vec_i16); |
| 499 | + const auto vec_i32_1 = hn::PromoteUpperTo(di32, vec_i16); |
| 500 | + |
| 501 | + raw0 = hn::ConvertTo(df, vec_i32_0); |
| 502 | + raw1 = hn::ConvertTo(df, vec_i32_1); |
| 503 | + } |
| 504 | + |
| 505 | + template <class DBF, HWY_IF_BF16_D(DBF)> |
| 506 | + static HWY_INLINE void Load2(DBF dbf, const PackedSpan<const Packed>& packed, |
| 507 | + const size_t packed_ofs, hn::Vec<DBF>& raw0, |
| 508 | + hn::Vec<DBF>& raw1) { |
| 509 | + const hn::Repartition<float, DBF> df; |
| 510 | + const hn::Repartition<int32_t, DBF> di32; |
| 511 | + const hn::Repartition<int16_t, DBF> di16; |
| 512 | + const hn::Repartition<int8_t, DBF> di8; |
| 513 | + |
| 514 | + const auto v8 = hn::LoadU(di8, packed.ptr + packed_ofs); |
| 515 | + |
| 516 | + const auto v16_0 = hn::PromoteLowerTo(di16, v8); |
| 517 | + const auto v16_1 = hn::PromoteUpperTo(di16, v8); |
| 518 | + |
| 519 | + const auto v32_0_lo = hn::PromoteLowerTo(di32, v16_0); |
| 520 | + const auto v32_0_hi = hn::PromoteUpperTo(di32, v16_0); |
| 521 | + const auto f0_lo = hn::ConvertTo(df, v32_0_lo); |
| 522 | + const auto f0_hi = hn::ConvertTo(df, v32_0_hi); |
| 523 | + raw0 = hn::OrderedDemote2To(dbf, f0_lo, f0_hi); |
| 524 | + |
| 525 | + const auto v32_1_lo = hn::PromoteLowerTo(di32, v16_1); |
| 526 | + const auto v32_1_hi = hn::PromoteUpperTo(di32, v16_1); |
| 527 | + const auto f1_lo = hn::ConvertTo(df, v32_1_lo); |
| 528 | + const auto f1_hi = hn::ConvertTo(df, v32_1_hi); |
| 529 | + raw1 = hn::OrderedDemote2To(dbf, f1_lo, f1_hi); |
| 530 | + } |
| 531 | + |
| 532 | + template <class DF, HWY_IF_F32_D(DF)> |
| 533 | + static HWY_INLINE void DecompressAndZeroPad( |
| 534 | + DF df, const PackedSpan<const Packed>& packed, const size_t packed_ofs, |
| 535 | + float* HWY_RESTRICT raw, size_t num) { |
| 536 | + const hn::Rebind<int32_t, DF> di32; |
| 537 | + const hn::Rebind<int16_t, DF> di16; |
| 538 | + const hn::Rebind<int8_t, DF> di8; |
| 539 | + using VF = hn::Vec<DF>; |
| 540 | + const size_t NF = hn::Lanes(df); |
| 541 | + |
| 542 | + size_t i = 0; |
| 543 | + if (num >= 2 * NF) { |
| 544 | + for (; i <= num - 2 * NF; i += 2 * NF) { |
| 545 | + VF raw0, raw1; |
| 546 | + Load2(df, packed, packed_ofs + i, raw0, raw1); |
| 547 | + hn::StoreU(raw0, df, raw + i); |
| 548 | + hn::StoreU(raw1, df, raw + i + NF); |
| 549 | + } |
| 550 | + } |
| 551 | + |
| 552 | + const size_t remaining = num - i; |
| 553 | + if (HWY_UNLIKELY(remaining != 0)) { |
| 554 | + for (size_t j = 0; j < remaining; ++j) { |
| 555 | + raw[i + j] = static_cast<float>(packed.ptr[packed_ofs + i + j]); |
| 556 | + } |
| 557 | + } |
| 558 | + } |
| 559 | + |
| 560 | + template <class DBF, HWY_IF_BF16_D(DBF)> |
| 561 | + static HWY_INLINE void DecompressAndZeroPad( |
| 562 | + DBF dbf, const PackedSpan<const Packed>& packed, const size_t packed_ofs, |
| 563 | + BF16* HWY_RESTRICT raw, size_t num) { |
| 564 | + const hn::Repartition<float, DBF> df; |
| 565 | + const size_t NF = hn::Lanes(df); |
| 566 | + size_t i = 0; |
| 567 | + const size_t NBF = hn::Lanes(dbf); |
| 568 | + if (num >= NBF) { |
| 569 | + for (; i <= num - NBF; i += NBF) { |
| 570 | + hn::Vec<decltype(df)> f0, f1; |
| 571 | + Load2(df, packed, packed_ofs + i, f0, f1); |
| 572 | + auto vbf = hn::OrderedDemote2To(dbf, f0, f1); |
| 573 | + hn::StoreU(vbf, dbf, raw + i); |
| 574 | + } |
| 575 | + } |
| 576 | + const size_t remaining = num - i; |
| 577 | + if (remaining > 0) { |
| 578 | + HWY_ALIGN float buf[2 * hn::MaxLanes(df)]; |
| 579 | + DecompressAndZeroPad(df, packed, packed_ofs + i, buf, remaining); |
| 580 | + auto f0 = hn::LoadU(df, buf); |
| 581 | + auto f1 = hn::LoadU(df, buf + NF); |
| 582 | + auto vbf = hn::OrderedDemote2To(dbf, f0, f1); |
| 583 | + hn::StoreN(vbf, dbf, raw + i, remaining); |
| 584 | + } |
| 585 | + } |
| 586 | +}; |
| 587 | + |
447 | 588 | // Integer quantization. |
448 | 589 | template <> |
449 | 590 | struct CompressTraits<I8Stream> { |
|
0 commit comments