diff --git a/.container_build_image b/.container_build_image new file mode 100644 index 0000000000000..e8712bc3cb180 --- /dev/null +++ b/.container_build_image @@ -0,0 +1 @@ +rocky-8-kernel-builder diff --git a/.github/workflows/build-check_aarch64.yml b/.github/workflows/build-check_aarch64.yml index e9b915067c02e..f9af175c4659e 100644 --- a/.github/workflows/build-check_aarch64.yml +++ b/.github/workflows/build-check_aarch64.yml @@ -32,3 +32,8 @@ jobs: cp configs/kernel-aarch64.config .config make olddefconfig make -j8 + - name: Check kabi + run: | + git clone --branch r8 --single-branch https://git.rockylinux.org/staging/rpms/kernel.git kernel-dist-git + git -C kernel-dist-git reset --hard imports/r8/kernel-4.18.0-553.16.1.el8_10 + ./kernel-dist-git/SOURCES/check-kabi -k ./kernel-dist-git/SOURCES/Module.kabi_aarch64 -s Module.symvers diff --git a/.github/workflows/build-check_x86_64.yml b/.github/workflows/build-check_x86_64.yml index 033208cc7fdf1..18f4dc7459a5a 100644 --- a/.github/workflows/build-check_x86_64.yml +++ b/.github/workflows/build-check_x86_64.yml @@ -32,3 +32,8 @@ jobs: cp configs/kernel-x86_64.config .config make olddefconfig make -j8 + - name: Check kabi + run: | + git clone --branch r8 --single-branch https://git.rockylinux.org/staging/rpms/kernel.git kernel-dist-git + git -C kernel-dist-git reset --hard imports/r8/kernel-4.18.0-553.16.1.el8_10 + ./kernel-dist-git/SOURCES/check-kabi -k ./kernel-dist-git/SOURCES/Module.kabi_x86_64 -s Module.symvers diff --git a/.github/workflows/diffdiff.py b/.github/workflows/diffdiff.py new file mode 100755 index 0000000000000..dc2c5ab0d1e9d --- /dev/null +++ b/.github/workflows/diffdiff.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# + +import argparse +import copy +import difflib +import io +import git +import os +import re +import subprocess +import sys +import tempfile + +verbose = False + + +def get_upstream_commit(upstream, c): + for l in c.message.splitlines(): + try: + sha = re.match('\s*commit\s+(?P\S+)', l).groups()[0].upper() + return upstream.commit(sha) + except: + True + +def get_diff(d): + dif = '' + df = False + for l in d.splitlines(): + if l[:10] == 'diff --git': + df = True + if not df: + continue + dif = dif + l + '\n' + return dif + + +def trim_unchanged_files(lines): + dl = [] + ld = 0 # Last line with a 'diff --git' we saw + hd = False # Have we seen a changed line since ld? + i = 0 + for i, l in enumerate(lines): + if l[:4] == '+++ ' or l[:4] == '--- ' : + continue + if l[0] == '+' or l[0] == '-': + hd = True + if l[:11] == ' diff --git': + if ld: # We are at a new diff now, last one started at 'ld' + if not hd: + dl.insert(0, (ld, i+1),) + ld = i + hd = False # Reset hasdiff to False as we start a new section + # and check the tail + if not hd: + dl.insert(0, (ld, i+1),) + # delete the unchanged file sections + for d in dl: + del lines[d[0]:d[1]] + return lines + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-v', action='store_true', help='Verbose') + parser.add_argument('--colour', action='store_true', help='Colorize the diff. Green for additions, red for deletions') + parser.add_argument('--commit', help='Commit in current tree to diffdiff. Default is the most recent commit.') + parser.add_argument('--upstream', help='A directory that contains the current upstream of linus kernel tree where we can find the commits we reference. Default is the current repo') + args = parser.parse_args() + + + if args.v: + verbose = True + + srcgit = git.Repo.init('.') + upstream = git.Repo.init(args.upstream) + c = srcgit.head.commit if not args.commit else srcgit.commit(args.commit) + uc = get_upstream_commit(upstream, c) + + dc = get_diff(srcgit.git.show(c)) + duc = get_diff(upstream.git.show(uc)) + + with open('c.diff', 'w') as f: + f.write(dc) + with open('u.diff', 'w') as f: + f.write(duc) + + res = subprocess.run(['diff', '-u', 'u.diff', 'c.diff'], + check=False, stdout=subprocess.PIPE) + lines = res.stdout.splitlines() + dd = [] + for l in lines: + l = str(l)[2:-1] + if l[:6] == '-index': + continue + if l[:6] == '+index': + continue + if l[:3] == '-@@': + continue + if l[:3] == '+@@': + dd.append(' ' + l[1:]) + continue + dd.append(l) + + # trim diffs for files that did not change + lines = trim_unchanged_files(dd) + + # colorize the diff + diffs = 0 + if args.colour: + dd = [] + for l in lines: + if l[0:4] != '+++ ' and l[0:4] != '--- ': + if l[0] == '+': + l = '\033[42m' + l + '\033[0m' + diffs = diffs + 1 + if l[0] == '-': + l = '\033[41m' + l + '\033[0m' + diffs = diffs + 1 + dd.append(l) + lines = dd + + + if diffs: + for l in lines: + print(l) + + sys.exit(diffs) diff --git a/.github/workflows/kernel-build-and-test-multiarch.yml b/.github/workflows/kernel-build-and-test-multiarch.yml new file mode 100644 index 0000000000000..f5177d72d4948 --- /dev/null +++ b/.github/workflows/kernel-build-and-test-multiarch.yml @@ -0,0 +1,11 @@ +name: Automated kernel build and test x86_64 and aarch64 + +on: + push: + branches: + - '*_rlc-8/**' + +jobs: + kernelCI: + uses: ctrliq/kernel-src-tree/.github/workflows/kernel-build-and-test-multiarch.yml@main + secrets: inherit diff --git a/.github/workflows/process-git-request.rb b/.github/workflows/process-git-request.rb new file mode 100644 index 0000000000000..9be1869d51087 --- /dev/null +++ b/.github/workflows/process-git-request.rb @@ -0,0 +1,140 @@ +require 'open3' + +requestors = { "gvrose8192" => "" } + +def file_prepend(file, str) + new_contents = "" + File.open(file, 'r') do |fd| + contents = fd.read + new_contents = str << contents + end + # Overwrite file but now with prepended string on it + File.open(file, 'w') do |fd| + fd.write(new_contents) + end +end + +def process_git_request(fname, target_branch, source_branch, prj_dir) + retcode = 200 #presume success +# puts "Opening file " + fname + file = File.new(fname, "w") + working_dir = prj_dir +# puts "Working Dir : " + working_dir + Dir.chdir working_dir +# puts "pwd : " + Dir.pwd + git_cmd = "git log --oneline --no-abbrev-commit base_repo/" + target_branch + ".." + "origin/" + source_branch +# puts git_cmd + out, err, status = Open3.capture3(git_cmd) + if status.exitstatus != 0 + puts "Command error output is " + err + file.write("Command error output is " + err) + file.close + retcode = 201 + return retcode + end + output_lines = out.split(' ') +# we just want the commit sha IDs + output_lines.each { |x| +# puts "This is output_lines " + x + upstream_diff = false + if !x[/\H/] + if x.length < 40 + next + end + git_cmd = "git show " + x + gitlog_out, gitlog_err, gitlog_status = Open3.capture3(git_cmd) + if gitlog_status.exitstatus != 0 + file.write("git show command error output is " + gitlog_err) + retcode = 201 + end + loglines = gitlog_out.lines.map(&:chomp) + lines_counted = 0 + local_diffdiff_sha = "" + upstream_diffdiff_sha = "" + loglines.each { |logline| + lines_counted = lines_counted + 1 + if lines_counted == 1 + local_commit_sha = logline.match("[0-9a-f]\{40\}") + local_diffdiff_sha = local_commit_sha.to_s +# puts "Local : " + local_diffdiff_sha + file.write("Merge Request sha: " + local_diffdiff_sha) + file.write("\n") + end + if lines_counted == 2 #email address + if !logline.downcase.include? "ciq.com" + # Bad Author + s = "error:\nBad " + logline + "\n" + puts s + file.write(s) + retcode = 201 + else + file.write("\t" + logline + "\n") + end + end + if lines_counted > 1 + if logline.downcase.include? "jira" + file.write("\t" + logline + "\n") + end + if logline.downcase.include? "upstream-diff" + upstream_diff = true + end + if logline.downcase.include? "commit" + commit_sha = logline.match("[0-9a-f]\{40\}") + upstream_diffdiff_sha = commit_sha.to_s +# puts "Upstream : " + upstream_diffdiff_sha + if (!upstream_diffdiff_sha.empty?) + file.write("\tUpstream sha: " + upstream_diffdiff_sha) + file.write("\n") + end + end + end + if lines_counted > 8 #Everything we need should be in the first 8 lines + break + end + } + if !local_diffdiff_sha.empty? && !upstream_diffdiff_sha.empty? + diff_cmd = Dir.pwd + "/.github/workflows/diffdiff.py --colour --commit " + local_diffdiff_sha + puts "diffdiff: " + diff_cmd + diff_out, diff_err, diff_status = Open3.capture3(diff_cmd) + if diff_status.exitstatus != 0 && !upstream_diff + puts "diffdiff out: " + diff_out + puts "diffdiff err: " + diff_err + retcode = 201 + file.write("error:\nCommit: " + local_diffdiff_sha + " differs with no upstream tag in commit message\n") + end + end + end + } + file.close + return retcode +end + +first_arg, *argv_in = ARGV +if argv_in.length < 5 + puts "Not enough arguments: fname, target_branch, source_branch, prj_dir, pull_request, requestor" + exit +end +fname = first_arg.to_s +fname = "tmp-" + fname +# puts "filename is " + fname +target_branch = argv_in[0].to_s +# puts "target branch is " + target_branch +source_branch = argv_in[1].to_s +# puts "source branch is " + source_branch +prj_dir = argv_in[2].to_s +# puts "project dir is " + prj_dir +pullreq = argv_in[3].to_s +# puts "pull request is " + pullreq +requestor = argv_in[4].to_s +retcode = process_git_request(fname, target_branch, source_branch, prj_dir) +if retcode != 200 + File.open(fname, 'r') do |fd| + contents = fd.read + puts contents + end + exit(1) +else + puts "Done" +end +exit(0) + diff --git a/.github/workflows/validate-kernel-commits.yml b/.github/workflows/validate-kernel-commits.yml new file mode 100644 index 0000000000000..c74434336e251 --- /dev/null +++ b/.github/workflows/validate-kernel-commits.yml @@ -0,0 +1,10 @@ +name: Validate Kernel Commits + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + check: + uses: ctrliq/kernel-src-tree/.github/workflows/validate-kernel-commits.yml@main + secrets: inherit diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4130d43c6435a..c4cb3f660b838 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1021,17 +1021,34 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - if (c->extended_cpuid_level >= 0x80000008) { + if (!cpu_has(c, X86_FEATURE_CPUID) || + (c->extended_cpuid_level < 0x80000008)) { + if (IS_ENABLED(CONFIG_X86_64)) { + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; + } else { + c->x86_clflush_size = 32; + c->x86_virt_bits = 32; + c->x86_phys_bits = 32; + + if (cpu_has(c, X86_FEATURE_PAE) || + cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; + } + } else { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif + c->x86_cache_bits = c->x86_phys_bits; + c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1485,17 +1502,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; - c->x86_phys_bits = 36; - c->x86_virt_bits = 48; -#else - c->x86_clflush_size = 32; - c->x86_phys_bits = 32; - c->x86_virt_bits = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; @@ -1505,8 +1511,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_vendor(c); get_cpu_cap(c); get_model_name(c); /* RHEL8: get model name for unsupported check */ - get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); cpu_parse_early_param(); if (this_cpu->c_early_init) @@ -1520,6 +1526,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } else { identify_cpu_without_cpuid(c); setup_clear_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); } setup_force_cpu_cap(X86_FEATURE_ALWAYS); diff --git a/configs/kernel-x86_64.config b/configs/kernel-x86_64.config index 626a6d72a64f0..af22210c4aadb 100644 --- a/configs/kernel-x86_64.config +++ b/configs/kernel-x86_64.config @@ -316,12 +316,10 @@ # CONFIG_CRYPTO_AEGIS128_AESNI_SSE2 is not set # CONFIG_CRYPTO_AEGIS256 is not set # CONFIG_CRYPTO_AEGIS256_AESNI_SSE2 is not set -# CONFIG_CRYPTO_AES_TI is not set # CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set # CONFIG_CRYPTO_DEV_CCREE is not set # CONFIG_CRYPTO_DEV_CHELSIO_TLS is not set # CONFIG_CRYPTO_DEV_VIRTIO is not set -# CONFIG_CRYPTO_KEYWRAP is not set # CONFIG_CRYPTO_LZ4 is not set # CONFIG_CRYPTO_LZ4HC is not set # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -2676,6 +2674,7 @@ CONFIG_CROSS_MEMORY_ATTACH=y CONFIG_CRYPTO=y CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_NI_INTEL=y +CONFIG_CRYPTO_AES_TI=y CONFIG_CRYPTO_AES_X86_64=y CONFIG_CRYPTO_ANSI_CPRNG=m CONFIG_CRYPTO_ANUBIS=m @@ -2745,6 +2744,7 @@ CONFIG_CRYPTO_GHASH=y CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_KEYWRAP=y CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_MANAGER=y @@ -2768,7 +2768,7 @@ CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_SHA1_SSSE3=y CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA256_SSSE3=y -CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SHA3=y CONFIG_CRYPTO_SHA512=y CONFIG_CRYPTO_SHA512_SSSE3=y CONFIG_CRYPTO_TEA=m @@ -5851,3 +5851,5 @@ CONFIG_ZRAM_WRITEBACK=y CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_ZSWAP=y +CONFIG_CRYPTO_FIPS_CUSTOM_VERSION=y +CONFIG_CRYPTO_FIPS_VERSION="rocky8.20240923" diff --git a/crypto/Kconfig b/crypto/Kconfig index 4591652cc086c..e9b9878043ea9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1777,6 +1777,7 @@ config CRYPTO_ANSI_CPRNG tristate "Pseudo Random Number Generation for Cryptographic modules" select CRYPTO_AES select CRYPTO_RNG + select CRYPTO_SHA3 help This option enables the generic pseudo random number generator for cryptographic modules. Uses the Algorithm specified in diff --git a/crypto/aead.c b/crypto/aead.c index 60b3bbe973e75..033aa5e9584c3 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -45,8 +45,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 4a967b545f4ab..c051916775a0c 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -139,7 +139,7 @@ int public_key_verify_signature(const struct public_key *pkey, ret = -EKEYREJECTED; out_free_output: - kfree(output); + kfree_sensitive(output); error_free_req: akcipher_request_free(req); error_free_tfm: diff --git a/crypto/cipher.c b/crypto/cipher.c index 57836c30a49a6..ba4193ba237bf 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -38,8 +38,7 @@ static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/dh.c b/crypto/dh.c index 1bcb47c90055a..57851d7ad4ac0 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -218,10 +218,35 @@ static int dh_compute_value(struct kpp_request *req) /* SP800-56A rev 3 5.6.2.1.3 key check */ } else { + MPI val_pct; + if (dh_is_pubkey_valid(ctx, val)) { ret = -EAGAIN; goto err_free_val; } + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance + * of Pair-wise Consistency"): recompute the + * public key and check if the results match. + */ + val_pct = mpi_alloc(0); + if (!val_pct) { + ret = -ENOMEM; + goto err_free_val; + } + + ret = _compute_val(ctx, base, val_pct); + if (ret) { + mpi_free(val_pct); + goto err_free_val; + } + + if (mpi_cmp(val, val_pct) != 0) { + mpi_free(val_pct); + panic("DH PCT failed in FIPS mode"); + } + mpi_free(val_pct); } } diff --git a/crypto/ecdh.c b/crypto/ecdh.c index 96f69cbbc38a3..c5e5e0c79235a 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "ecc.h" struct ecdh_ctx { @@ -53,6 +54,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, ctx->curve_id = params.curve_id; ctx->ndigits = ndigits; + memset(ctx->private_key, 0, sizeof(ctx->private_key)); + if (!params.key || !params.key_size) return ecc_gen_privkey(ctx->curve_id, ctx->ndigits, ctx->private_key); @@ -113,6 +116,36 @@ static int ecdh_compute_value(struct kpp_request *req) ctx->private_key, public_key); buf = public_key; nbytes = public_key_sz; + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance of + * Pair-wise Consistency"): recompute the public key + * and check if the results match. + */ + if (fips_enabled) { + u64 *public_key_pct; + + if (ret < 0) + goto free_all; + + public_key_pct = kmalloc(public_key_sz, GFP_KERNEL); + if (!public_key_pct) { + ret = -ENOMEM; + goto free_all; + } + + ret = ecc_make_pub_key(ctx->curve_id, ctx->ndigits, + ctx->private_key, + public_key_pct); + if (ret < 0) { + kfree(public_key_pct); + goto free_all; + } + + if (memcmp(public_key, public_key_pct, public_key_sz)) + panic("ECDH PCT failed in FIPS mode"); + kfree(public_key_pct); + } } if (ret < 0) diff --git a/crypto/essiv.c b/crypto/essiv.c index a8befc8fb06ed..e1707538392e3 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -127,7 +127,7 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); if (err) - return err; + goto out; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & @@ -137,6 +137,8 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_aead_set_flags(tfm, crypto_cipher_get_flags(tctx->essiv_cipher) & CRYPTO_TFM_RES_MASK); +out: + memzero_explicit(&keys, sizeof(keys)); return err; } diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index 5b39c0c6a7c31..e878ff7585349 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. + * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. + */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. + */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. + */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + + /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. + */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 1b0377e6efa0b..c18fc9fd43d0a 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -118,6 +119,22 @@ struct rand_data { #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ +/* + * The output n bits can receive more than n bits of min entropy, of course, + * but the fixed output of the conditioning function can only asymptotically + * approach the output size bits of min entropy, not attain that bound. Random + * maps will tend to have output collisions, which reduces the creditable + * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). + * + * The value "64" is justified in Appendix A.4 of the current 90C draft, + * and aligns with NIST's in "epsilon" definition in this document, which is + * that a string can be considered "full entropy" if you can bound the min + * entropy in each bit of output to at least 1-epsilon, where epsilon is + * required to be <= 2^(-32). + */ +#define JENT_ENTROPY_SAFETY_FACTOR 64 + +#include #include "jitterentropy.h" /*************************************************************************** @@ -285,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. * * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -301,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -328,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? * * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -436,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -504,20 +465,24 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ static void jent_gen_entropy(struct rand_data *ec) { - unsigned int k = 0; + unsigned int k = 0, safety_factor = 0; + + if (fips_enabled) + safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec); @@ -531,7 +496,7 @@ static void jent_gen_entropy(struct rand_data *ec) * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ - if (++k >= (DATA_SIZE_BITS * ec->osr)) + if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } @@ -555,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -585,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. */ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -595,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -609,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -636,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -649,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -662,6 +631,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These @@ -699,7 +669,7 @@ int jent_entropy_init(void) /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index faca092456fa3..3132705aa192f 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -358,8 +358,8 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, sizeof(struct gdma_create_dma_region_resp)); create_req->length = umem->length; - create_req->offset_in_page = umem->address & (page_sz - 1); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", @@ -460,13 +460,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 3cf493a51b8d0..e761f12d8901d 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ +#include + #include "idpf.h" /** @@ -176,6 +178,58 @@ static int idpf_tx_singleq_csum(struct sk_buff *skb, return 1; } +/** + * idpf_tx_singleq_dma_map_error - handle TX DMA map errors + * @txq: queue to send buffer on + * @skb: send buffer + * @first: original first buffer info buffer for packet + * @idx: starting point on ring to unwind + */ +static void idpf_tx_singleq_dma_map_error(struct idpf_tx_queue *txq, + struct sk_buff *skb, + struct idpf_tx_buf *first, u16 idx) +{ + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + /* clear dma mappings for failed tx_buf map */ + for (;;) { + struct idpf_tx_buf *tx_buf; + + tx_buf = &txq->tx_buf[idx]; + libeth_tx_complete(tx_buf, &cp); + if (tx_buf == first) + break; + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + if (skb_is_gso(skb)) { + union idpf_tx_flex_desc *tx_desc; + + /* If we failed a DMA mapping for a TSO packet, we will have + * used one additional descriptor for a context + * descriptor. Reset that here. + */ + tx_desc = &txq->flex_tx[idx]; + memset(tx_desc, 0, sizeof(*tx_desc)); + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + /* Update tail in case netdev_xmit_more was previously true */ + idpf_tx_buf_hw_update(txq, idx, false); +} + /** * idpf_tx_singleq_map - Build the Tx base descriptor * @tx_q: queue to send buffer on @@ -216,12 +270,14 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_singleq_dma_map_error(tx_q, skb, + first, i); /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); dma_unmap_addr_set(tx_buf, dma, dma); + tx_buf->type = LIBETH_SQE_FRAG; /* align size to end of page */ max_data += -dma & (IDPF_TX_MAX_READ_REQ_SIZE - 1); @@ -235,14 +291,17 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, offsets, max_data, td_tag); - tx_desc++; - i++; - - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = &tx_q->tx_buf[0]; tx_desc = &tx_q->base_tx[0]; i = 0; + } else { + tx_buf++; + tx_desc++; } + tx_buf->type = LIBETH_SQE_EMPTY; + dma += max_data; size -= max_data; @@ -255,12 +314,14 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets, size, td_tag); - tx_desc++; - i++; - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = &tx_q->tx_buf[0]; tx_desc = &tx_q->base_tx[0]; i = 0; + } else { + tx_buf++; + tx_desc++; } size = skb_frag_size(frag); @@ -268,8 +329,6 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, dma = skb_frag_dma_map(tx_q->dev, frag, 0, size, DMA_TO_DEVICE); - - tx_buf = &tx_q->tx_buf[i]; } skb_tx_timestamp(first->skb); @@ -280,13 +339,13 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets, size, td_tag); - IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i); + first->type = LIBETH_SQE_SKB; + first->rs_idx = i; - /* set next_to_watch value indicating a packet is present */ - first->next_to_watch = tx_desc; + IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i); nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - netdev_tx_sent_queue(nq, first->bytecount); + netdev_tx_sent_queue(nq, first->bytes); idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more()); } @@ -304,8 +363,7 @@ idpf_tx_singleq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_base_tx_ctx_desc *ctx_desc; int ntu = txq->next_to_use; - memset(&txq->tx_buf[ntu], 0, sizeof(struct idpf_tx_buf)); - txq->tx_buf[ntu].ctx_entry = true; + txq->tx_buf[ntu].type = LIBETH_SQE_CTX; ctx_desc = &txq->base_ctx[ntu]; @@ -356,11 +414,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, { struct idpf_tx_offload_params offload = { }; struct idpf_tx_buf *first; + u32 count, buf_count = 1; int csum, tso, needed; - unsigned int count; __be16 protocol; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); @@ -399,11 +457,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, first->skb = skb; if (tso) { - first->gso_segs = offload.tso_segs; - first->bytecount = skb->len + ((first->gso_segs - 1) * offload.tso_hdr_len); + first->packets = offload.tso_segs; + first->bytes = skb->len + ((first->packets - 1) * offload.tso_hdr_len); } else { - first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); - first->gso_segs = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + first->packets = 1; } idpf_tx_singleq_map(tx_q, first, &offload); @@ -423,10 +481,15 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, int *cleaned) { - unsigned int total_bytes = 0, total_pkts = 0; + struct libeth_sq_napi_stats ss = { }; struct idpf_base_tx_desc *tx_desc; u32 budget = tx_q->clean_budget; s16 ntc = tx_q->next_to_clean; + struct libeth_cq_pp cp = { + .dev = tx_q->dev, + .ss = &ss, + .napi = napi_budget, + }; struct idpf_netdev_priv *np; struct idpf_tx_buf *tx_buf; struct netdev_queue *nq; @@ -444,47 +507,26 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, * such. We can skip this descriptor since there is no buffer * to clean. */ - if (tx_buf->ctx_entry) { - /* Clear this flag here to avoid stale flag values when - * this buffer is used for actual data in the future. - * There are cases where the tx_buf struct / the flags - * field will not be cleared before being reused. - */ - tx_buf->ctx_entry = false; + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) { + tx_buf->type = LIBETH_SQE_EMPTY; goto fetch_next_txq_desc; } - /* if next_to_watch is not set then no work pending */ - eop_desc = (struct idpf_base_tx_desc *)tx_buf->next_to_watch; - if (!eop_desc) + if (unlikely(tx_buf->type != LIBETH_SQE_SKB)) break; - /* prevent any other reads prior to eop_desc */ + /* prevent any other reads prior to type */ smp_rmb(); + eop_desc = &tx_q->base_tx[tx_buf->rs_idx]; + /* if the descriptor isn't done, no work yet to do */ if (!(eop_desc->qw1 & cpu_to_le64(IDPF_TX_DESC_DTYPE_DESC_DONE))) break; - /* clear next_to_watch to prevent false hangs */ - tx_buf->next_to_watch = NULL; - /* update the statistics for this packet */ - total_bytes += tx_buf->bytecount; - total_pkts += tx_buf->gso_segs; - - napi_consume_skb(tx_buf->skb, napi_budget); - - /* unmap skb header data */ - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - - /* clear tx_buf data */ - tx_buf->skb = NULL; - dma_unmap_len_set(tx_buf, len, 0); + libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ while (tx_desc != eop_desc) { @@ -498,13 +540,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, } /* unmap any remaining paged data */ - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } + libeth_tx_complete(tx_buf, &cp); } /* update budget only if we did something */ @@ -524,11 +560,11 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, ntc += tx_q->desc_count; tx_q->next_to_clean = ntc; - *cleaned += total_pkts; + *cleaned += ss.packets; u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_add(&tx_q->q_stats.packets, total_pkts); - u64_stats_add(&tx_q->q_stats.bytes, total_bytes); + u64_stats_add(&tx_q->q_stats.packets, ss.packets); + u64_stats_add(&tx_q->q_stats.bytes, ss.bytes); u64_stats_update_end(&tx_q->stats_sync); np = netdev_priv(tx_q->netdev); @@ -536,7 +572,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, dont_wake = np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); - __netif_txq_completed_wake(nq, total_pkts, total_bytes, + __netif_txq_completed_wake(nq, ss.packets, ss.bytes, IDPF_DESC_UNUSED(tx_q), IDPF_TX_WAKE_THRESH, dont_wake); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index e163e54d1c31e..24b255145f57c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1,42 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ +#include + #include "idpf.h" #include "idpf_virtchnl.h" +#define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) +LIBETH_SQE_CHECK_PRIV(u32); + static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); -/** - * idpf_buf_lifo_push - push a buffer pointer onto stack - * @stack: pointer to stack struct - * @buf: pointer to buf to push - * - * Returns 0 on success, negative on failure - **/ -static int idpf_buf_lifo_push(struct idpf_buf_lifo *stack, - struct idpf_tx_stash *buf) -{ - if (unlikely(stack->top == stack->size)) - return -ENOSPC; - - stack->bufs[stack->top++] = buf; - - return 0; -} - -/** - * idpf_buf_lifo_pop - pop a buffer pointer from stack - * @stack: pointer to stack struct - **/ -static struct idpf_tx_stash *idpf_buf_lifo_pop(struct idpf_buf_lifo *stack) -{ - if (unlikely(!stack->top)) - return NULL; - - return stack->bufs[--stack->top]; -} - /** * idpf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure @@ -58,66 +33,29 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue) } } -/** - * idpf_tx_buf_rel - Release a Tx buffer - * @tx_q: the queue that owns the buffer - * @tx_buf: the buffer to free - */ -static void idpf_tx_buf_rel(struct idpf_tx_queue *tx_q, - struct idpf_tx_buf *tx_buf) -{ - if (tx_buf->skb) { - if (dma_unmap_len(tx_buf, len)) - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_buf->skb); - } else if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - } - - tx_buf->next_to_watch = NULL; - tx_buf->skb = NULL; - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - dma_unmap_len_set(tx_buf, len, 0); -} - /** * idpf_tx_buf_rel_all - Free any empty Tx buffers * @txq: queue to be cleaned */ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { - struct idpf_buf_lifo *buf_stack; - u16 i; + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + u32 i; /* Buffers already cleared, nothing to do */ if (!txq->tx_buf) return; /* Free all the Tx buffer sk_buffs */ - for (i = 0; i < txq->desc_count; i++) - idpf_tx_buf_rel(txq, &txq->tx_buf[i]); + for (i = 0; i < txq->buf_pool_size; i++) + libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); txq->tx_buf = NULL; - - if (!idpf_queue_has(FLOW_SCH_EN, txq)) - return; - - buf_stack = &txq->stash->buf_stack; - if (!buf_stack->bufs) - return; - - for (i = 0; i < buf_stack->size; i++) - kfree(buf_stack->bufs[i]); - - kfree(buf_stack->bufs); - buf_stack->bufs = NULL; } /** @@ -129,10 +67,14 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) { idpf_tx_buf_rel_all(txq); + netdev_tx_reset_subqueue(txq->netdev, txq->idx); if (!txq->desc_ring) return; + if (txq->refillq) + kfree(txq->refillq->ring); + dmam_free_coherent(txq->dev, txq->size, txq->desc_ring, txq->dma); txq->desc_ring = NULL; txq->next_to_use = 0; @@ -189,45 +131,18 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport) */ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { - struct idpf_buf_lifo *buf_stack; - int buf_size; - int i; - /* Allocate book keeping buffers only. Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ - buf_size = sizeof(struct idpf_tx_buf) * tx_q->desc_count; - tx_q->tx_buf = kzalloc(buf_size, GFP_KERNEL); + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) + tx_q->buf_pool_size = U16_MAX; + else + tx_q->buf_pool_size = tx_q->desc_count; + tx_q->tx_buf = kcalloc(tx_q->buf_pool_size, sizeof(*tx_q->tx_buf), + GFP_KERNEL); if (!tx_q->tx_buf) return -ENOMEM; - /* Initialize tx_bufs with invalid completion tags */ - for (i = 0; i < tx_q->desc_count; i++) - tx_q->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - - if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) - return 0; - - buf_stack = &tx_q->stash->buf_stack; - - /* Initialize tx buf stack for out-of-order completions if - * flow scheduling offload is enabled - */ - buf_stack->bufs = kcalloc(tx_q->desc_count, sizeof(*buf_stack->bufs), - GFP_KERNEL); - if (!buf_stack->bufs) - return -ENOMEM; - - buf_stack->size = tx_q->desc_count; - buf_stack->top = tx_q->desc_count; - - for (i = 0; i < tx_q->desc_count; i++) { - buf_stack->bufs[i] = kzalloc(sizeof(*buf_stack->bufs[i]), - GFP_KERNEL); - if (!buf_stack->bufs[i]) - return -ENOMEM; - } - return 0; } @@ -242,7 +157,9 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, struct idpf_tx_queue *tx_q) { struct device *dev = tx_q->dev; + struct idpf_sw_queue *refillq; int err; + unsigned int i = 0; err = idpf_tx_buf_alloc_all(tx_q); if (err) @@ -265,6 +182,31 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, tx_q->next_to_clean = 0; idpf_queue_set(GEN_CHK, tx_q); + if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) + return 0; + + refillq = tx_q->refillq; + refillq->desc_count = tx_q->buf_pool_size; + refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), + GFP_KERNEL); + if (!refillq->ring) { + err = -ENOMEM; + goto err_alloc; + } + + for (i = 0; i < refillq->desc_count; i++) + refillq->ring[i] = + FIELD_PREP(IDPF_RFL_BI_BUFID_M, i) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, + idpf_queue_has(GEN_CHK, refillq)); + + /* Go ahead and flip the GEN bit since this counts as filling + * up the ring, i.e. we already ring wrapped. + */ + idpf_queue_change(GEN_CHK, refillq); + + tx_q->last_re = tx_q->desc_count - IDPF_TX_SPLITQ_RE_MIN_GAP; + return 0; err_alloc: @@ -315,8 +257,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) for (i = 0; i < vport->num_txq_grp; i++) { for (j = 0; j < vport->txq_grps[i].num_txq; j++) { struct idpf_tx_queue *txq = vport->txq_grps[i].txqs[j]; - u8 gen_bits = 0; - u16 bufidx_mask; err = idpf_tx_desc_alloc(vport, txq); if (err) { @@ -325,34 +265,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) i); goto err_out; } - - if (!idpf_is_queue_model_split(vport->txq_model)) - continue; - - txq->compl_tag_cur_gen = 0; - - /* Determine the number of bits in the bufid - * mask and add one to get the start of the - * generation bits - */ - bufidx_mask = txq->desc_count - 1; - while (bufidx_mask >> 1) { - txq->compl_tag_gen_s++; - bufidx_mask = bufidx_mask >> 1; - } - txq->compl_tag_gen_s++; - - gen_bits = IDPF_TX_SPLITQ_COMPL_TAG_WIDTH - - txq->compl_tag_gen_s; - txq->compl_tag_gen_max = GETMAXVAL(gen_bits); - - /* Set bufid mask based on location of first - * gen bit; it cannot simply be the descriptor - * ring size-1 since we can have size values - * where not all of those bits are set. - */ - txq->compl_tag_bufid_m = - GETMAXVAL(txq->compl_tag_gen_s); } if (!idpf_is_queue_model_split(vport->txq_model)) @@ -586,18 +498,18 @@ static int idpf_rx_hdr_buf_alloc_all(struct idpf_buf_queue *bufq) } /** - * idpf_rx_post_buf_refill - Post buffer id to refill queue + * idpf_post_buf_refill - Post buffer id to refill queue * @refillq: refill queue to post to * @buf_id: buffer id to post */ -static void idpf_rx_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) +static void idpf_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) { u32 nta = refillq->next_to_use; /* store the buffer ID and the SW maintained GEN bit to the refillq */ refillq->ring[nta] = - FIELD_PREP(IDPF_RX_BI_BUFID_M, buf_id) | - FIELD_PREP(IDPF_RX_BI_GEN_M, + FIELD_PREP(IDPF_RFL_BI_BUFID_M, buf_id) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, idpf_queue_has(GEN_CHK, refillq)); if (unlikely(++nta == refillq->desc_count)) { @@ -977,6 +889,11 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) struct idpf_txq_group *txq_grp = &vport->txq_grps[i]; for (j = 0; j < txq_grp->num_txq; j++) { + if (flow_sch_en) { + kfree(txq_grp->txqs[j]->refillq); + txq_grp->txqs[j]->refillq = NULL; + } + kfree(txq_grp->txqs[j]); txq_grp->txqs[j] = NULL; } @@ -986,9 +903,6 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) kfree(txq_grp->complq); txq_grp->complq = NULL; - - if (flow_sch_en) - kfree(txq_grp->stashes); } kfree(vport->txq_grps); vport->txq_grps = NULL; @@ -1347,7 +1261,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) for (i = 0; i < vport->num_txq_grp; i++) { struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i]; struct idpf_adapter *adapter = vport->adapter; - struct idpf_txq_stash *stashes; int j; tx_qgrp->vport = vport; @@ -1360,15 +1273,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) goto err_alloc; } - if (split && flow_sch_en) { - stashes = kcalloc(num_txq, sizeof(*stashes), - GFP_KERNEL); - if (!stashes) - goto err_alloc; - - tx_qgrp->stashes = stashes; - } - for (j = 0; j < tx_qgrp->num_txq; j++) { struct idpf_tx_queue *q = tx_qgrp->txqs[j]; @@ -1388,12 +1292,14 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) if (!flow_sch_en) continue; - if (split) { - q->stash = &stashes[j]; - hash_init(q->stash->sched_buf_hash); - } - idpf_queue_set(FLOW_SCH_EN, q); + + q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); + if (!q->refillq) + goto err_alloc; + + idpf_queue_set(GEN_CHK, q->refillq); + idpf_queue_set(RFL_GEN_CHK, q->refillq); } if (!split) @@ -1651,125 +1557,10 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q) wake_up(&vport->sw_marker_wq); } -/** - * idpf_tx_splitq_clean_hdr - Clean TX buffer resources for header portion of - * packet - * @tx_q: tx queue to clean buffer from - * @tx_buf: buffer to be cleaned - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @napi_budget: Used to determine if we are in netpoll - */ -static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q, - struct idpf_tx_buf *tx_buf, - struct idpf_cleaned_stats *cleaned, - int napi_budget) -{ - napi_consume_skb(tx_buf->skb, napi_budget); - - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - - dma_unmap_len_set(tx_buf, len, 0); - } - - /* clear tx_buf data */ - tx_buf->skb = NULL; - - cleaned->bytes += tx_buf->bytecount; - cleaned->packets += tx_buf->gso_segs; -} - -/** - * idpf_tx_clean_stashed_bufs - clean bufs that were stored for - * out of order completions - * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @budget: Used to determine if we are in netpoll - */ -static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, - u16 compl_tag, - struct idpf_cleaned_stats *cleaned, - int budget) -{ - struct idpf_tx_stash *stash; - struct hlist_node *tmp_buf; - - /* Buffer completion */ - hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, - hlist, compl_tag) { - if (unlikely(stash->buf.compl_tag != (int)compl_tag)) - continue; - - if (stash->buf.skb) { - idpf_tx_splitq_clean_hdr(txq, &stash->buf, cleaned, - budget); - } else if (dma_unmap_len(&stash->buf, len)) { - dma_unmap_page(txq->dev, - dma_unmap_addr(&stash->buf, dma), - dma_unmap_len(&stash->buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(&stash->buf, len, 0); - } - - /* Push shadow buf back onto stack */ - idpf_buf_lifo_push(&txq->stash->buf_stack, stash); - - hash_del(&stash->hlist); - } -} - -/** - * idpf_stash_flow_sch_buffers - store buffer parameters info to be freed at a - * later time (only relevant for flow scheduling mode) - * @txq: Tx queue to clean - * @tx_buf: buffer to store - */ -static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, - struct idpf_tx_buf *tx_buf) -{ - struct idpf_tx_stash *stash; - - if (unlikely(!dma_unmap_addr(tx_buf, dma) && - !dma_unmap_len(tx_buf, len))) - return 0; - - stash = idpf_buf_lifo_pop(&txq->stash->buf_stack); - if (unlikely(!stash)) { - net_err_ratelimited("%s: No out-of-order TX buffers left!\n", - netdev_name(txq->netdev)); - - return -ENOMEM; - } - - /* Store buffer params in shadow buffer */ - stash->buf.skb = tx_buf->skb; - stash->buf.bytecount = tx_buf->bytecount; - stash->buf.gso_segs = tx_buf->gso_segs; - dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); - dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); - stash->buf.compl_tag = tx_buf->compl_tag; - - /* Add buffer to buf_hash table to be freed later */ - hash_add(txq->stash->sched_buf_hash, &stash->hlist, - stash->buf.compl_tag); - - memset(tx_buf, 0, sizeof(struct idpf_tx_buf)); - - /* Reinitialize buf_id portion of tag */ - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - - return 0; -} - #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \ do { \ - (ntc)++; \ - if (unlikely(!(ntc))) { \ - ntc -= (txq)->desc_count; \ + if (unlikely(++(ntc) == (txq)->desc_count)) { \ + ntc = 0; \ buf = (txq)->tx_buf; \ desc = &(txq)->flex_tx[0]; \ } else { \ @@ -1796,156 +1587,95 @@ do { \ */ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, bool descs_only) { union idpf_tx_flex_desc *next_pending_desc = NULL; union idpf_tx_flex_desc *tx_desc; - s16 ntc = tx_q->next_to_clean; + u32 ntc = tx_q->next_to_clean; + struct libeth_cq_pp cp = { + .dev = tx_q->dev, + .ss = cleaned, + .napi = napi_budget, + }; struct idpf_tx_buf *tx_buf; + if (descs_only) { + /* Bump ring index to mark as cleaned. */ + tx_q->next_to_clean = end; + return; + } + tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; tx_buf = &tx_q->tx_buf[ntc]; - ntc -= tx_q->desc_count; while (tx_desc != next_pending_desc) { - union idpf_tx_flex_desc *eop_desc; + u32 eop_idx; /* If this entry in the ring was used as a context descriptor, - * it's corresponding entry in the buffer ring will have an - * invalid completion tag since no buffer was used. We can - * skip this descriptor since there is no buffer to clean. + * it's corresponding entry in the buffer ring is reserved. We + * can skip this descriptor since there is no buffer to clean. */ - if (unlikely(tx_buf->compl_tag == IDPF_SPLITQ_TX_INVAL_COMPL_TAG)) + if (tx_buf->type <= LIBETH_SQE_CTX) goto fetch_next_txq_desc; - eop_desc = (union idpf_tx_flex_desc *)tx_buf->next_to_watch; - - /* clear next_to_watch to prevent false hangs */ - tx_buf->next_to_watch = NULL; + if (unlikely(tx_buf->type != LIBETH_SQE_SKB)) + break; - if (descs_only) { - if (idpf_stash_flow_sch_buffers(tx_q, tx_buf)) - goto tx_splitq_clean_out; + eop_idx = tx_buf->rs_idx; + libeth_tx_complete(tx_buf, &cp); - while (tx_desc != eop_desc) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); + /* unmap remaining buffers */ + while (ntc != eop_idx) { + idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, + tx_desc, tx_buf); - if (dma_unmap_len(tx_buf, len)) { - if (idpf_stash_flow_sch_buffers(tx_q, - tx_buf)) - goto tx_splitq_clean_out; - } - } - } else { - idpf_tx_splitq_clean_hdr(tx_q, tx_buf, cleaned, - napi_budget); - - /* unmap remaining buffers */ - while (tx_desc != eop_desc) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - - /* unmap any remaining paged data */ - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } - } + /* unmap any remaining paged data */ + libeth_tx_complete(tx_buf, &cp); } fetch_next_txq_desc: idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); } -tx_splitq_clean_out: - ntc += tx_q->desc_count; tx_q->next_to_clean = ntc; } -#define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \ -do { \ - (buf)++; \ - (ntc)++; \ - if (unlikely((ntc) == (txq)->desc_count)) { \ - buf = (txq)->tx_buf; \ - ntc = 0; \ - } \ -} while (0) - /** - * idpf_tx_clean_buf_ring - clean flow scheduling TX queue buffers + * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) + * @buf_id: packet's starting buffer ID, from completion descriptor * @cleaned: pointer to stats struct to track cleaned packets/bytes * @budget: Used to determine if we are in netpoll * - * Cleans all buffers associated with the input completion tag either from the - * TX buffer ring or from the hash table if the buffers were previously - * stashed. Returns the byte/segment count for the cleaned packet associated - * this completion tag. + * Clean all buffers associated with the packet starting at buf_id. Returns the + * byte/segment count for the cleaned packet. */ -static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct idpf_cleaned_stats *cleaned, - int budget) +static void idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, + struct libeth_sq_napi_stats *cleaned, + int budget) { - u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; - u16 ntc = txq->next_to_clean; - u16 num_descs_cleaned = 0; - u16 orig_idx = idx; - - tx_buf = &txq->tx_buf[idx]; - - while (tx_buf->compl_tag == (int)compl_tag) { - if (tx_buf->skb) { - idpf_tx_splitq_clean_hdr(txq, tx_buf, cleaned, budget); - } else if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(txq->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } - - memset(tx_buf, 0, sizeof(struct idpf_tx_buf)); - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = cleaned, + .napi = budget, + }; - num_descs_cleaned++; - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + tx_buf = &txq->tx_buf[buf_id]; + if (tx_buf->type == LIBETH_SQE_SKB) { + libeth_tx_complete(tx_buf, &cp); + idpf_post_buf_refill(txq->refillq, buf_id); } - /* If we didn't clean anything on the ring for this completion, there's - * nothing more to do. - */ - if (unlikely(!num_descs_cleaned)) - return false; + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + buf_id = idpf_tx_buf_next(tx_buf); - /* Otherwise, if we did clean a packet on the ring directly, it's safe - * to assume that the descriptors starting from the original - * next_to_clean up until the previously cleaned packet can be reused. - * Therefore, we will go back in the ring and stash any buffers still - * in the ring into the hash table to be cleaned later. - */ - tx_buf = &txq->tx_buf[ntc]; - while (tx_buf != &txq->tx_buf[orig_idx]) { - idpf_stash_flow_sch_buffers(txq, tx_buf); - idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); + tx_buf = &txq->tx_buf[buf_id]; + libeth_tx_complete(tx_buf, &cp); + idpf_post_buf_refill(txq->refillq, buf_id); } - - /* Finally, update next_to_clean to reflect the work that was just done - * on the ring, if any. If the packet was only cleaned from the hash - * table, the ring will not be impacted, therefore we should not touch - * next_to_clean. The updated idx is used here - */ - txq->next_to_clean = idx; - - return true; } /** @@ -1961,24 +1691,20 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, */ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, struct idpf_splitq_tx_compl_desc *desc, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { - u16 compl_tag; + /* RS completion contains queue head for queue based scheduling or + * completion tag for flow based scheduling. + */ + u16 rs_compl_val = le16_to_cpu(desc->q_head_compl_tag.q_head); if (!idpf_queue_has(FLOW_SCH_EN, txq)) { - u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head); - - return idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + idpf_tx_splitq_clean(txq, rs_compl_val, budget, cleaned, false); + return; } - compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - - /* If we didn't clean anything on the ring, this packet must be - * in the hash table. Go clean it there. - */ - if (!idpf_tx_clean_buf_ring(txq, compl_tag, cleaned, budget)) - idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); + idpf_tx_clean_bufs(txq, rs_compl_val, cleaned, budget); } /** @@ -2004,7 +1730,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget, ntc -= complq->desc_count; do { - struct idpf_cleaned_stats cleaned_stats = { }; + struct libeth_sq_napi_stats cleaned_stats = { }; struct idpf_tx_queue *tx_q; int rel_tx_qid; u16 hw_head; @@ -2095,8 +1821,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget, /* Update BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - dont_wake = !complq_ok || IDPF_TX_BUF_RSV_LOW(tx_q) || - np->state != __IDPF_VPORT_UP || + dont_wake = !complq_ok || np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); /* Check if the TXQ needs to and can be restarted */ __netif_txq_completed_wake(nq, tx_q->cleaned_pkts, tx_q->cleaned_bytes, @@ -2153,15 +1878,21 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag); } -/* Global conditions to tell whether the txq (and related resources) - * has room to allow the use of "size" descriptors. +/** + * idpf_tx_splitq_has_room - check if enough Tx splitq resources are available + * @tx_q: the queue to be checked + * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of Tx buffers required for this packet + * + * Return: 0 if no room available, 1 otherwise */ -static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) +static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, + u32 bufs_needed) { - if (IDPF_DESC_UNUSED(tx_q) < size || + if (IDPF_DESC_UNUSED(tx_q) < descs_needed || IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || - IDPF_TX_BUF_RSV_LOW(tx_q)) + idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) return 0; return 1; } @@ -2170,14 +1901,21 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions * @tx_q: the queue to be checked * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of buffers needed for this packet * - * Returns 0 if stop is not needed + * Return: 0 if stop is not needed */ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q, - unsigned int descs_needed) + u32 descs_needed, + u32 bufs_needed) { + /* Since we have multiple resources to check for splitq, our + * start,stop_thrs becomes a boolean check instead of a count + * threshold. + */ if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, - idpf_txq_has_room(tx_q, descs_needed), + idpf_txq_has_room(tx_q, descs_needed, + bufs_needed), 1, 1)) return 0; @@ -2219,14 +1957,16 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, } /** - * idpf_tx_desc_count_required - calculate number of Tx descriptors needed + * idpf_tx_res_count_required - get number of Tx resources needed for this pkt * @txq: queue to send buffer on * @skb: send buffer + * @bufs_needed: (output) number of buffers needed for this skb. * - * Returns number of data descriptors needed for this skb. + * Return: number of data descriptors and buffers needed for this skb. */ -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb) +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, + u32 *bufs_needed) { const struct skb_shared_info *shinfo; unsigned int count = 0, i; @@ -2237,6 +1977,7 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; shinfo = skb_shinfo(skb); + *bufs_needed += shinfo->nr_frags; for (i = 0; i < shinfo->nr_frags; i++) { unsigned int size; @@ -2266,65 +2007,89 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, } /** - * idpf_tx_dma_map_error - handle TX DMA map errors - * @txq: queue to send buffer on - * @skb: send buffer - * @first: original first buffer info buffer for packet - * @idx: starting point on ring to unwind + * idpf_tx_splitq_bump_ntu - adjust NTU and generation + * @txq: the tx ring to wrap + * @ntu: ring index to bump */ -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 idx) +static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) { - u64_stats_update_begin(&txq->stats_sync); - u64_stats_inc(&txq->q_stats.dma_map_errs); - u64_stats_update_end(&txq->stats_sync); + ntu++; - /* clear dma mappings for failed tx_buf map */ - for (;;) { - struct idpf_tx_buf *tx_buf; + if (ntu == txq->desc_count) + ntu = 0; - tx_buf = &txq->tx_buf[idx]; - idpf_tx_buf_rel(txq, tx_buf); - if (tx_buf == first) - break; - if (idx == 0) - idx = txq->desc_count; - idx--; - } + return ntu; +} - if (skb_is_gso(skb)) { - union idpf_tx_flex_desc *tx_desc; +/** + * idpf_tx_get_free_buf_id - get a free buffer ID from the refill queue + * @refillq: refill queue to get buffer ID from + * @buf_id: return buffer ID + * + * Return: true if a buffer ID was found, false if not + */ +static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, + u32 *buf_id) +{ + u32 ntc = refillq->next_to_clean; + u32 refill_desc; - /* If we failed a DMA mapping for a TSO packet, we will have - * used one additional descriptor for a context - * descriptor. Reset that here. - */ - tx_desc = &txq->flex_tx[idx]; - memset(tx_desc, 0, sizeof(struct idpf_flex_tx_ctx_desc)); - if (idx == 0) - idx = txq->desc_count; - idx--; + refill_desc = refillq->ring[ntc]; + + if (unlikely(idpf_queue_has(RFL_GEN_CHK, refillq) != + !!(refill_desc & IDPF_RFL_BI_GEN_M))) + return false; + + *buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); + + if (unlikely(++ntc == refillq->desc_count)) { + idpf_queue_change(RFL_GEN_CHK, refillq); + ntc = 0; } - /* Update tail in case netdev_xmit_more was previously true */ - idpf_tx_buf_hw_update(txq, idx, false); + refillq->next_to_clean = ntc; + + return true; } /** - * idpf_tx_splitq_bump_ntu - adjust NTU and generation - * @txq: the tx ring to wrap - * @ntu: ring index to bump + * idpf_tx_splitq_pkt_err_unmap - Unmap buffers and bump tail in case of error + * @txq: Tx queue to unwind + * @params: pointer to splitq params struct + * @first: starting buffer for packet to unmap */ -static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) +static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, + struct idpf_tx_splitq_params *params, + struct idpf_tx_buf *first) { - ntu++; + struct idpf_sw_queue *refillq = txq->refillq; + struct libeth_sq_napi_stats ss = { }; + struct idpf_tx_buf *tx_buf = first; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; - if (ntu == txq->desc_count) { - ntu = 0; - txq->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(txq); + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + libeth_tx_complete(tx_buf, &cp); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + tx_buf = &txq->tx_buf[idpf_tx_buf_next(tx_buf)]; + libeth_tx_complete(tx_buf, &cp); } - return ntu; + /* Update tail in case netdev_xmit_more was previously true. */ + idpf_tx_buf_hw_update(txq, params->prev_ntu, false); + + if (!refillq) + return; + + /* Restore refillq state to avoid leaking tags. */ + if (params->prev_refill_gen != idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = params->prev_refill_ntc; } /** @@ -2348,6 +2113,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, struct netdev_queue *nq; struct sk_buff *skb; skb_frag_t *frag; + u32 next_buf_id; u16 td_cmd = 0; dma_addr_t dma; @@ -2363,17 +2129,19 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, dma = dma_map_single(tx_q->dev, skb->data, size, DMA_TO_DEVICE); tx_buf = first; - - params->compl_tag = - (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; + first->nr_frags = 0; for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } - tx_buf->compl_tag = params->compl_tag; + first->nr_frags++; + tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); @@ -2427,29 +2195,13 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, max_data); - tx_desc++; - i++; - - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = - IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); + } else { + tx_desc++; } - /* Since this packet has a buffer that is going to span - * multiple descriptors, it's going to leave holes in - * to the TX buffer ring. To ensure these holes do not - * cause issues in the cleaning routines, we will clear - * them of any stale data and assign them the same - * completion tag as the current packet. Then when the - * packet is being cleaned, the cleaning routines will - * simply pass over these holes and finish cleaning the - * rest of the packet. - */ - memset(&tx_q->tx_buf[i], 0, sizeof(struct idpf_tx_buf)); - tx_q->tx_buf[i].compl_tag = params->compl_tag; - /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, * max_data will be >= 12K and <= 16K-1. On any @@ -2472,40 +2224,51 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, break; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); - tx_desc++; - i++; - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); + } else { + tx_desc++; + } + + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &next_buf_id))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } + } else { + next_buf_id = i; } + idpf_tx_buf_next(tx_buf) = next_buf_id; + tx_buf = &tx_q->tx_buf[next_buf_id]; size = skb_frag_size(frag); data_len -= size; dma = skb_frag_dma_map(tx_q->dev, frag, 0, size, DMA_TO_DEVICE); - - tx_buf = &tx_q->tx_buf[i]; } /* record SW timestamp if HW timestamp is not available */ skb_tx_timestamp(skb); + first->type = LIBETH_SQE_SKB; + /* write last descriptor with RS and EOP bits */ + first->rs_idx = i; + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); - /* set next_to_watch value indicating a packet is present */ - first->next_to_watch = tx_desc; - tx_q->txq_grp->num_completions_pending++; /* record bytecount for BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - netdev_tx_sent_queue(nq, first->bytecount); + netdev_tx_sent_queue(nq, first->bytes); idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more()); } @@ -2705,9 +2468,6 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - memset(&txq->tx_buf[i], 0, sizeof(struct idpf_tx_buf)); - txq->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - /* grab the next descriptor */ desc = &txq->flex_ctx[i]; txq->next_to_use = idpf_tx_splitq_bump_ntu(txq, i); @@ -2733,6 +2493,21 @@ netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb) return NETDEV_TX_OK; } +/** + * idpf_tx_splitq_need_re - check whether RE bit needs to be set + * @tx_q: pointer to Tx queue + * + * Return: true if RE bit needs to be set, false otherwise + */ +static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) +{ + int gap = tx_q->next_to_use - tx_q->last_re; + + gap += (gap < 0) ? tx_q->desc_count : 0; + + return gap >= IDPF_TX_SPLITQ_RE_MIN_GAP; +} + /** * idpf_tx_splitq_frame - Sends buffer on Tx ring using flex descriptors * @skb: send buffer @@ -2743,12 +2518,15 @@ netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb) static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q) { - struct idpf_tx_splitq_params tx_params = { }; + struct idpf_tx_splitq_params tx_params = { + .prev_ntu = tx_q->next_to_use, + }; struct idpf_tx_buf *first; - unsigned int count; + u32 count, buf_count = 1; int tso; + u32 buf_id; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); @@ -2758,7 +2536,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, /* Check for splitq specific TX resources */ count += (IDPF_TX_DESCS_PER_CACHE_LINE + tso); - if (idpf_tx_maybe_stop_splitq(tx_q, count)) { + if (idpf_tx_maybe_stop_splitq(tx_q, count, buf_count)) { idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false); return NETDEV_TX_BUSY; @@ -2785,36 +2563,47 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, u64_stats_update_end(&tx_q->stats_sync); } - /* record the location of the first descriptor for this packet */ - first = &tx_q->tx_buf[tx_q->next_to_use]; - first->skb = skb; + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + struct idpf_sw_queue *refillq = tx_q->refillq; - if (tso) { - first->gso_segs = tx_params.offload.tso_segs; - first->bytecount = skb->len + - ((first->gso_segs - 1) * tx_params.offload.tso_hdr_len); - } else { - first->gso_segs = 1; - first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); - } + /* Save refillq state in case of a packet rollback. Otherwise, + * the tags will be leaked since they will be popped from the + * refillq but never reposted during cleaning. + */ + tx_params.prev_refill_gen = + idpf_queue_has(RFL_GEN_CHK, refillq); + tx_params.prev_refill_ntc = refillq->next_to_clean; + + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &buf_id))) { + if (tx_params.prev_refill_gen != + idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = tx_params.prev_refill_ntc; + + tx_q->next_to_use = tx_params.prev_ntu; + return idpf_tx_drop_skb(tx_q, skb); + } + tx_params.compl_tag = buf_id; - if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; - /* Set the RE bit to catch any packets that may have not been - * stashed during RS completion cleaning. MIN_GAP is set to - * MIN_RING size to ensure it will be set at least once each - * time around the ring. + /* Set the RE bit to periodically "clean" the descriptor ring. + * MIN_GAP is set to MIN_RING size to ensure it will be set at + * least once each time around the ring. */ - if (!(tx_q->next_to_use % IDPF_TX_SPLITQ_RE_MIN_GAP)) { + if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; tx_q->txq_grp->num_completions_pending++; + tx_q->last_re = tx_q->next_to_use; } if (skb->ip_summed == CHECKSUM_PARTIAL) tx_params.offload.td_cmd |= IDPF_TXD_FLEX_FLOW_CMD_CS_EN; } else { + buf_id = tx_q->next_to_use; + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2; tx_params.eop_cmd = IDPF_TXD_LAST_DESC_CMD; @@ -2822,6 +2611,18 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TX_FLEX_DESC_CMD_CS_EN; } + first = &tx_q->tx_buf[buf_id]; + first->skb = skb; + + if (tso) { + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); + } else { + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + idpf_tx_splitq_map(tx_q, &tx_params, first); return NETDEV_TX_OK; @@ -3375,7 +3176,7 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) if (!skb) break; - idpf_rx_post_buf_refill(refillq, buf_id); + idpf_post_buf_refill(refillq, buf_id); IDPF_RX_BUMP_NTC(rxq, ntc); /* skip if it is non EOP desc */ @@ -3473,10 +3274,10 @@ static void idpf_rx_clean_refillq(struct idpf_buf_queue *bufq, bool failure; if (idpf_queue_has(RFL_GEN_CHK, refillq) != - !!(refill_desc & IDPF_RX_BI_GEN_M)) + !!(refill_desc & IDPF_RFL_BI_GEN_M)) break; - buf_id = FIELD_GET(IDPF_RX_BI_BUFID_M, refill_desc); + buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); failure = idpf_rx_update_bufq_desc(bufq, buf_id, buf_desc); if (failure) break; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index f119f240d21cd..1011e25e9f925 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -113,8 +113,8 @@ do { \ */ #define IDPF_TX_SPLITQ_RE_MIN_GAP 64 -#define IDPF_RX_BI_GEN_M BIT(16) -#define IDPF_RX_BI_BUFID_M GENMASK(15, 0) +#define IDPF_RFL_BI_GEN_M BIT(16) +#define IDPF_RFL_BI_BUFID_M GENMASK(15, 0) #define IDPF_RXD_EOF_SPLITQ VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_EOF_M #define IDPF_RXD_EOF_SINGLEQ VIRTCHNL2_RX_BASE_DESC_STATUS_EOF_M @@ -123,25 +123,16 @@ do { \ ((((txq)->next_to_clean > (txq)->next_to_use) ? 0 : (txq)->desc_count) + \ (txq)->next_to_clean - (txq)->next_to_use - 1) -#define IDPF_TX_BUF_RSV_UNUSED(txq) ((txq)->stash->buf_stack.top) -#define IDPF_TX_BUF_RSV_LOW(txq) (IDPF_TX_BUF_RSV_UNUSED(txq) < \ - (txq)->desc_count >> 2) - #define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1) /* Determine the absolute number of completions pending, i.e. the number of * completions that are expected to arrive on the TX completion queue. */ #define IDPF_TX_COMPLQ_PENDING(txq) \ (((txq)->num_completions_pending >= (txq)->complq->num_completions ? \ - 0 : U64_MAX) + \ + 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) -#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -#define IDPF_SPLITQ_TX_INVAL_COMPL_TAG -1 -/* Adjust the generation for the completion tag and wrap if necessary */ -#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ - ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ - 0 : (txq)->compl_tag_cur_gen) +#define IDPF_TXBUF_NULL U32_MAX #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) @@ -155,59 +146,7 @@ union idpf_tx_flex_desc { struct idpf_flex_tx_sched_desc flow; /* flow based scheduling */ }; -/** - * struct idpf_tx_buf - * @next_to_watch: Next descriptor to clean - * @skb: Pointer to the skb - * @dma: DMA address - * @len: DMA length - * @bytecount: Number of bytes - * @gso_segs: Number of GSO segments - * @compl_tag: Splitq only, unique identifier for a buffer. Used to compare - * with completion tag returned in buffer completion event. - * Because the completion tag is expected to be the same in all - * data descriptors for a given packet, and a single packet can - * span multiple buffers, we need this field to track all - * buffers associated with this completion tag independently of - * the buf_id. The tag consists of a N bit buf_id and M upper - * order "generation bits". See compl_tag_bufid_m and - * compl_tag_gen_s in struct idpf_queue. We'll use a value of -1 - * to indicate the tag is not valid. - * @ctx_entry: Singleq only. Used to indicate the corresponding entry - * in the descriptor ring was used for a context descriptor and - * this buffer entry should be skipped. - */ -struct idpf_tx_buf { - void *next_to_watch; - struct sk_buff *skb; - DEFINE_DMA_UNMAP_ADDR(dma); - DEFINE_DMA_UNMAP_LEN(len); - unsigned int bytecount; - unsigned short gso_segs; - - union { - int compl_tag; - - bool ctx_entry; - }; -}; - -struct idpf_tx_stash { - struct hlist_node hlist; - struct idpf_tx_buf buf; -}; - -/** - * struct idpf_buf_lifo - LIFO for managing OOO completions - * @top: Used to know how many buffers are left - * @size: Total size of LIFO - * @bufs: Backing array - */ -struct idpf_buf_lifo { - u16 top; - u16 size; - struct idpf_tx_stash **bufs; -}; +#define idpf_tx_buf libeth_sqe /** * struct idpf_tx_offload_params - Offload parameters for a given packet @@ -241,6 +180,9 @@ struct idpf_tx_offload_params { * @compl_tag: Associated tag for completion * @td_tag: Descriptor tunneling tag * @offload: Offload parameters + * @prev_ntu: stored TxQ next_to_use in case of rollback + * @prev_refill_ntc: stored refillq next_to_clean in case of packet rollback + * @prev_refill_gen: stored refillq generation bit in case of packet rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -251,6 +193,10 @@ struct idpf_tx_splitq_params { }; struct idpf_tx_offload_params offload; + + u16 prev_ntu; + u16 prev_refill_ntc; + bool prev_refill_gen; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -589,11 +535,6 @@ struct idpf_tx_queue_stats { u64_stats_t dma_map_errs; }; -struct idpf_cleaned_stats { - u32 packets; - u32 bytes; -}; - #define IDPF_ITR_DYNAMIC 1 #define IDPF_ITR_MAX 0x1FE0 #define IDPF_ITR_20K 0x0032 @@ -610,17 +551,6 @@ struct idpf_cleaned_stats { #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) #define IDPF_DIM_DEFAULT_PROFILE_IX 1 -/** - * struct idpf_txq_stash - Tx buffer stash for Flow-based scheduling mode - * @buf_stack: Stack of empty buffers to store buffer info for out of order - * buffer completions. See struct idpf_buf_lifo - * @sched_buf_hash: Hash table to store buffers - */ -struct idpf_txq_stash { - struct idpf_buf_lifo buf_stack; - DECLARE_HASHTABLE(sched_buf_hash, 12); -} ____cacheline_aligned; - /** * struct idpf_rx_queue - software structure representing a receive queue * @rx: universal receive descriptor array @@ -718,6 +648,8 @@ struct idpf_rx_queue { * @desc_count: Number of descriptors * @next_to_use: Next descriptor to use * @next_to_clean: Next descriptor to clean + * @last_re: last descriptor index that RE bit was set + * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @netdev: &net_device corresponding to this queue * @cleaned_bytes: Splitq only, TXQ only: When a TX completion is received on * the TX completion queue, it can be for any TXQ associated @@ -729,9 +661,8 @@ struct idpf_rx_queue { * only once at the end of the cleaning routine. * @clean_budget: singleq only, queue cleaning budget * @cleaned_pkts: Number of packets cleaned for the above said case - * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @tx_min_pkt_len: Min supported packet length - * @compl_tag_bufid_m: Completion tag buffer id mask + * @refillq: Pointer to refill queue * @compl_tag_gen_s: Completion tag generation bit * The format of the completion tag will change based on the TXQ * descriptor ring size so that we can maintain roughly the same level @@ -752,15 +683,13 @@ struct idpf_rx_queue { * -------------------------------- * * This gives us 8*8160 = 65280 possible unique values. - * @compl_tag_cur_gen: Used to keep track of current completion tag generation - * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset - * @stash: Tx buffer stash for Flow-based scheduling mode * @stats_sync: See struct u64_stats_sync * @q_stats: See union idpf_tx_queue_stats * @q_id: Queue id * @size: Length of descriptor ring in bytes * @dma: Physical address of ring * @q_vector: Backreference to associated vector + * @buf_pool_size: Total number of idpf_tx_buf */ struct idpf_tx_queue { union { @@ -771,7 +700,7 @@ struct idpf_tx_queue { void *desc_ring; }; - struct idpf_tx_buf *tx_buf; + struct libeth_sqe *tx_buf; struct idpf_txq_group *txq_grp; struct device *dev; void __iomem *tail; @@ -781,6 +710,8 @@ struct idpf_tx_queue { u16 desc_count; u16 next_to_use; u16 next_to_clean; + u16 last_re; + u16 tx_max_bufs; struct net_device *netdev; @@ -790,16 +721,8 @@ struct idpf_tx_queue { }; u16 cleaned_pkts; - u16 tx_max_bufs; u16 tx_min_pkt_len; - - u16 compl_tag_bufid_m; - u16 compl_tag_gen_s; - - u16 compl_tag_cur_gen; - u16 compl_tag_gen_max; - - struct idpf_txq_stash *stash; + struct idpf_sw_queue *refillq; struct u64_stats_sync stats_sync; struct idpf_tx_queue_stats q_stats; @@ -810,6 +733,7 @@ struct idpf_tx_queue { dma_addr_t dma; struct idpf_q_vector *q_vector; + u32 buf_pool_size; } ____cacheline_aligned; /** @@ -893,7 +817,7 @@ struct idpf_compl_queue { struct net_device *netdev; u32 clean_budget; - u32 num_completions; + aligned_u64 num_completions; /* Slowpath */ u32 q_id; @@ -995,7 +919,6 @@ struct idpf_rxq_group { * @vport: Vport back pointer * @num_txq: Number of TX queues associated * @txqs: Array of TX queue pointers - * @stashes: array of OOO stashes for the queues * @complq: Associated completion queue pointer, split queue only * @num_completions_pending: Total number of completions pending for the * completion queue, acculumated for all TX queues @@ -1010,11 +933,10 @@ struct idpf_txq_group { u16 num_txq; struct idpf_tx_queue *txqs[IDPF_LARGE_MAX_Q]; - struct idpf_txq_stash *stashes; struct idpf_compl_queue *complq; - u32 num_completions_pending; + aligned_u64 num_completions_pending; }; /** @@ -1143,6 +1065,17 @@ static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector) reg->dyn_ctl); } +/** + * idpf_tx_splitq_get_free_bufs - get number of free buf_ids in refillq + * @refillq: pointer to refillq containing buf_ids + */ +static inline u32 idpf_tx_splitq_get_free_bufs(struct idpf_sw_queue *refillq) +{ + return (refillq->next_to_use > refillq->next_to_clean ? + 0 : refillq->desc_count) + + refillq->next_to_use - refillq->next_to_clean - 1; +} + int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget); void idpf_vport_init_num_qs(struct idpf_vport *vport, struct virtchnl2_create_vport *vport_msg); @@ -1173,10 +1106,8 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 ring_idx); -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb); +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, u32 *buf_count); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 799726f4ec47f..bd8a364c9e21c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4424,7 +4424,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, /* Verify if UDP port is being offloaded by HW */ if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port)) - return features; + return vxlan_features_check(skb, features); #if IS_ENABLED(CONFIG_GENEVE) /* Support Geneve offload for default UDP port */ @@ -4450,7 +4450,6 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb, struct mlx5e_priv *priv = netdev_priv(netdev); features = vlan_features_check(skb, features); - features = vxlan_features_check(skb, features); /* Validate if the tunneled packet is being offloaded by HW */ if (skb->encapsulation && diff --git a/drivers/net/ethernet/microsoft/Kconfig b/drivers/net/ethernet/microsoft/Kconfig index 090e6b9832431..77bc47cbfbbf4 100644 --- a/drivers/net/ethernet/microsoft/Kconfig +++ b/drivers/net/ethernet/microsoft/Kconfig @@ -17,7 +17,8 @@ if NET_VENDOR_MICROSOFT config MICROSOFT_MANA tristate "Microsoft Azure Network Adapter (MANA) support" - depends on PCI_MSI && X86_64 + depends on PCI_MSI + depends on X86_64 || (ARM64 && !CPU_BIG_ENDIAN) depends on PCI_HYPERV select AUXILIARY_BUS help diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index f1b4b0b6ae65b..8d9e019bf1162 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -182,7 +182,7 @@ int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, dma_addr_t dma_handle; void *buf; - if (length < PAGE_SIZE || !is_power_of_2(length)) + if (length < MANA_PAGE_SIZE || !is_power_of_2(length)) return -EINVAL; gmi->dev = gc->dev; @@ -717,7 +717,7 @@ EXPORT_SYMBOL(mana_gd_destroy_dma_region); static int mana_gd_create_dma_region(struct gdma_dev *gd, struct gdma_mem_info *gmi) { - unsigned int num_page = gmi->length / PAGE_SIZE; + unsigned int num_page = gmi->length / MANA_PAGE_SIZE; struct gdma_create_dma_region_req *req = NULL; struct gdma_create_dma_region_resp resp = {}; struct gdma_context *gc = gd->gdma_context; @@ -727,10 +727,10 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, int err; int i; - if (length < PAGE_SIZE || !is_power_of_2(length)) + if (length < MANA_PAGE_SIZE || !is_power_of_2(length)) return -EINVAL; - if (offset_in_page(gmi->virt_addr) != 0) + if (!MANA_PAGE_ALIGNED(gmi->virt_addr)) return -EINVAL; hwc = gc->hwc.driver_data; @@ -751,7 +751,7 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, req->page_addr_list_len = num_page; for (i = 0; i < num_page; i++) - req->page_addr_list[i] = gmi->dma_handle + i * PAGE_SIZE; + req->page_addr_list[i] = gmi->dma_handle + i * MANA_PAGE_SIZE; err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp); if (err) diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 3a31ba66b821e..d2a339bc1cd25 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -361,12 +361,12 @@ static int mana_hwc_create_cq(struct hw_channel_context *hwc, u16 q_depth, int err; eq_size = roundup_pow_of_two(GDMA_EQE_SIZE * q_depth); - if (eq_size < MINIMUM_SUPPORTED_PAGE_SIZE) - eq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + if (eq_size < MANA_MIN_QSIZE) + eq_size = MANA_MIN_QSIZE; cq_size = roundup_pow_of_two(GDMA_CQE_SIZE * q_depth); - if (cq_size < MINIMUM_SUPPORTED_PAGE_SIZE) - cq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + if (cq_size < MANA_MIN_QSIZE) + cq_size = MANA_MIN_QSIZE; hwc_cq = kzalloc(sizeof(*hwc_cq), GFP_KERNEL); if (!hwc_cq) @@ -428,7 +428,7 @@ static int mana_hwc_alloc_dma_buf(struct hw_channel_context *hwc, u16 q_depth, dma_buf->num_reqs = q_depth; - buf_size = PAGE_ALIGN(q_depth * max_msg_size); + buf_size = MANA_PAGE_ALIGN(q_depth * max_msg_size); gmi = &dma_buf->mem_info; err = mana_gd_alloc_memory(gc, buf_size, gmi); @@ -496,8 +496,8 @@ static int mana_hwc_create_wq(struct hw_channel_context *hwc, else queue_size = roundup_pow_of_two(GDMA_MAX_SQE_SIZE * q_depth); - if (queue_size < MINIMUM_SUPPORTED_PAGE_SIZE) - queue_size = MINIMUM_SUPPORTED_PAGE_SIZE; + if (queue_size < MANA_MIN_QSIZE) + queue_size = MANA_MIN_QSIZE; hwc_wq = kzalloc(sizeof(*hwc_wq), GFP_KERNEL); if (!hwc_wq) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index e16317aadbca4..d80dd8baefdeb 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1867,10 +1867,10 @@ static int mana_create_txq(struct mana_port_context *apc, * to prevent overflow. */ txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32; - BUILD_BUG_ON(!PAGE_ALIGNED(txq_size)); + BUILD_BUG_ON(!MANA_PAGE_ALIGNED(txq_size)); cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE; - cq_size = PAGE_ALIGN(cq_size); + cq_size = MANA_PAGE_ALIGN(cq_size); gc = gd->gdma_context; @@ -2128,8 +2128,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, if (err) goto out; - rq_size = PAGE_ALIGN(rq_size); - cq_size = PAGE_ALIGN(cq_size); + rq_size = MANA_PAGE_ALIGN(rq_size); + cq_size = MANA_PAGE_ALIGN(cq_size); /* Create RQ */ memset(&spec, 0, sizeof(spec)); diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.c b/drivers/net/ethernet/microsoft/mana/shm_channel.c index 5553af9c8085a..0f1679ebad96b 100644 --- a/drivers/net/ethernet/microsoft/mana/shm_channel.c +++ b/drivers/net/ethernet/microsoft/mana/shm_channel.c @@ -6,6 +6,7 @@ #include #include +#include #include #define PAGE_FRAME_L48_WIDTH_BYTES 6 @@ -155,8 +156,8 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, return err; } - if (!PAGE_ALIGNED(eq_addr) || !PAGE_ALIGNED(cq_addr) || - !PAGE_ALIGNED(rq_addr) || !PAGE_ALIGNED(sq_addr)) + if (!MANA_PAGE_ALIGNED(eq_addr) || !MANA_PAGE_ALIGNED(cq_addr) || + !MANA_PAGE_ALIGNED(rq_addr) || !MANA_PAGE_ALIGNED(sq_addr)) return -EINVAL; if ((eq_msix_index & VECTOR_MASK) != eq_msix_index) @@ -183,7 +184,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* EQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(eq_addr); + frame_addr = MANA_PFN(eq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); @@ -191,7 +192,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* CQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(cq_addr); + frame_addr = MANA_PFN(cq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); @@ -199,7 +200,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* RQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(rq_addr); + frame_addr = MANA_PFN(rq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); @@ -207,7 +208,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* SQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(sq_addr); + frame_addr = MANA_PFN(sq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 74ca64e0a2c8a..fa2ad8299d9b7 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -1647,6 +1647,7 @@ void nsim_drv_remove(struct nsim_bus_dev *nsim_bus_dev) ARRAY_SIZE(nsim_devlink_params)); devl_resources_unregister(devlink); kfree(nsim_dev->vfconfigs); + kfree(nsim_dev->fa_cookie); devl_unlock(devlink); devlink_free(devlink); dev_set_drvdata(&nsim_bus_dev->dev, NULL); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f41e96f0e847e..35de6f091014c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -152,6 +152,18 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type) +{ + switch (type) { + case nvme_tcp_c2h_data: + case nvme_tcp_r2t: + case nvme_tcp_rsp: + return true; + default: + return false; + } +} + static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) { u32 queue_idx = nvme_tcp_queue_id(queue); @@ -674,6 +686,16 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) { + if (!nvme_tcp_recv_pdu_supported(hdr->type)) + goto unsupported_pdu; + + dev_err(queue->ctrl->ctrl.device, + "pdu type %d has unexpected header length (%d)\n", + hdr->type, hdr->hlen); + return -EPROTO; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) @@ -697,10 +719,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_init_recv_ctx(queue); return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: - dev_err(queue->ctrl->ctrl.device, - "unsupported pdu type (%d)\n", hdr->type); - return -EINVAL; + goto unsupported_pdu; } + +unsupported_pdu: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 14cac6a5598f3..5a26f8fc4319b 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -742,6 +742,7 @@ static int sriov_init(struct pci_dev *dev, int pos) struct pci_sriov *iov; struct resource *res; struct pci_dev *pdev; + u32 sriovbars[PCI_SRIOV_NUM_BARS]; pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl); if (ctrl & PCI_SRIOV_CTRL_VFE) { @@ -778,6 +779,10 @@ static int sriov_init(struct pci_dev *dev, int pos) if (!iov) return -ENOMEM; + /* Sizing SR-IOV BARs with VF Enable cleared - no decode */ + __pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS, + pos + PCI_SRIOV_BAR, sriovbars); + nres = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -789,7 +794,8 @@ static int sriov_init(struct pci_dev *dev, int pos) bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0; else bar64 = __pci_read_base(dev, pci_bar_unknown, res, - pos + PCI_SRIOV_BAR + i * 4); + pos + PCI_SRIOV_BAR + i * 4, + &sriovbars[i]); if (!res->flags) continue; if (resource_size(res) & (PAGE_SIZE - 1)) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ce956ebfda087..1421ea9e83a9a 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -213,8 +213,10 @@ bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); int pci_setup_device(struct pci_dev *dev); +void __pci_size_stdbars(struct pci_dev *dev, int count, + unsigned int pos, u32 *sizes); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, - struct resource *res, unsigned int reg); + struct resource *res, unsigned int reg, u32 *sizes); void pci_configure_ari(struct pci_dev *dev); void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 5b1b204fd3d43..c0af7d3d31bc7 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -170,40 +170,66 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar) #define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO) +/** + * __pci_size_bars - Read the raw BAR mask for a range of PCI BARs + * @dev: the PCI device + * @count: number of BARs to size + * @pos: starting config space position + * @sizes: array to store mask values + * @rom: indicate whether to use ROM mask, which avoids enabling ROM BARs + * + * Provided @sizes array must be sufficiently sized to store results for + * @count u32 BARs. Caller is responsible for disabling decode to specified + * BAR range around calling this function. This function is intended to avoid + * disabling decode around sizing each BAR individually, which can result in + * non-trivial overhead in virtualized environments with very large PCI BARs. + */ +static void __pci_size_bars(struct pci_dev *dev, int count, + unsigned int pos, u32 *sizes, bool rom) +{ + u32 orig, mask = rom ? PCI_ROM_ADDRESS_MASK : ~0; + int i; + + for (i = 0; i < count; i++, pos += 4, sizes++) { + pci_read_config_dword(dev, pos, &orig); + pci_write_config_dword(dev, pos, mask); + pci_read_config_dword(dev, pos, sizes); + pci_write_config_dword(dev, pos, orig); + } +} + +void __pci_size_stdbars(struct pci_dev *dev, int count, + unsigned int pos, u32 *sizes) +{ + __pci_size_bars(dev, count, pos, sizes, false); +} + +static void __pci_size_rom(struct pci_dev *dev, unsigned int pos, u32 *sizes) +{ + __pci_size_bars(dev, 1, pos, sizes, true); +} + /** * __pci_read_base - Read a PCI BAR * @dev: the PCI device * @type: type of the BAR * @res: resource buffer to be filled in * @pos: BAR position in the config space + * @sizes: array of one or more pre-read BAR masks * * Returns 1 if the BAR is 64-bit, or 0 if 32-bit. */ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, - struct resource *res, unsigned int pos) + struct resource *res, unsigned int pos, u32 *sizes) { - u32 l = 0, sz = 0, mask; + u32 l = 0, sz; u64 l64, sz64, mask64; - u16 orig_cmd; struct pci_bus_region region, inverted_region; - mask = type ? PCI_ROM_ADDRESS_MASK : ~0; - - /* No printks while decoding is disabled! */ - if (!dev->mmio_always_on) { - pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); - if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) { - pci_write_config_word(dev, PCI_COMMAND, - orig_cmd & ~PCI_COMMAND_DECODE_ENABLE); - } - } - res->name = pci_name(dev); pci_read_config_dword(dev, pos, &l); - pci_write_config_dword(dev, pos, l | mask); - pci_read_config_dword(dev, pos, &sz); - pci_write_config_dword(dev, pos, l); + sz = sizes[0]; /* * All bits set in sz means the device isn't working properly. @@ -243,18 +269,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, if (res->flags & IORESOURCE_MEM_64) { pci_read_config_dword(dev, pos + 4, &l); - pci_write_config_dword(dev, pos + 4, ~0); - pci_read_config_dword(dev, pos + 4, &sz); - pci_write_config_dword(dev, pos + 4, l); + sz = sizes[1]; l64 |= ((u64)l << 32); sz64 |= ((u64)sz << 32); mask64 |= ((u64)~0 << 32); } - if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE)) - pci_write_config_word(dev, PCI_COMMAND, orig_cmd); - if (!sz64) goto fail; @@ -326,7 +347,11 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) { + u32 rombar, stdbars[PCI_STD_NUM_BARS]; unsigned int pos, reg; + u16 orig_cmd; + + BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS); if (dev->non_compliant_bars) return; @@ -335,10 +360,28 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) if (dev->is_virtfn) return; + /* No printks while decoding is disabled! */ + if (!dev->mmio_always_on) { + pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); + if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) { + pci_write_config_word(dev, PCI_COMMAND, + orig_cmd & ~PCI_COMMAND_DECODE_ENABLE); + } + } + + __pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars); + if (rom) + __pci_size_rom(dev, rom, &rombar); + + if (!dev->mmio_always_on && + (orig_cmd & PCI_COMMAND_DECODE_ENABLE)) + pci_write_config_word(dev, PCI_COMMAND, orig_cmd); + for (pos = 0; pos < howmany; pos++) { struct resource *res = &dev->resource[pos]; reg = PCI_BASE_ADDRESS_0 + (pos << 2); - pos += __pci_read_base(dev, pci_bar_unknown, res, reg); + pos += __pci_read_base(dev, pci_bar_unknown, + res, reg, &stdbars[pos]); } if (rom) { @@ -346,7 +389,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) dev->rom_base_reg = rom; res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_READONLY | IORESOURCE_SIZEALIGN; - __pci_read_base(dev, pci_bar_mem32, res, rom); + __pci_read_base(dev, pci_bar_mem32, res, rom, &rombar); } } diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index cc824c6e19a7b..f66dde9adda1c 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1398,14 +1398,19 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, } /* - * Our channel array is sparsley populated and we + * Our channel array could be sparsley populated and we * initiated I/O on a processor/hw-q that does not * currently have a designated channel. Fix this. * The strategy is simple: - * I. Ensure NUMA locality - * II. Distribute evenly (best effort) + * I. Prefer the channel associated with the current CPU + * II. Ensure NUMA locality + * III. Distribute evenly (best effort) */ + /* Prefer the channel on the I/O issuing processor/hw-q */ + if (cpumask_test_cpu(q_num, &stor_device->alloced_cpus)) + return stor_device->stor_chns[q_num]; + node_mask = cpumask_of_node(cpu_to_node(q_num)); num_channels = 0; @@ -1461,59 +1466,48 @@ static int storvsc_do_io(struct hv_device *device, /* See storvsc_change_target_cpu(). */ outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); if (outgoing_channel != NULL) { - if (outgoing_channel->target_cpu == q_num) { - /* - * Ideally, we want to pick a different channel if - * available on the same NUMA node. - */ - node_mask = cpumask_of_node(cpu_to_node(q_num)); - for_each_cpu_wrap(tgt_cpu, - &stor_device->alloced_cpus, q_num + 1) { - if (!cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - if (tgt_cpu == q_num) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } - } + if (hv_get_avail_to_write_percent(&outgoing_channel->outbound) + > ring_avail_percent_lowater) + goto found_channel; - /* - * All the other channels on the same NUMA node are - * busy. Try to use the channel on the current CPU - */ - if (hv_get_avail_to_write_percent( - &outgoing_channel->outbound) - > ring_avail_percent_lowater) + /* + * Channel is busy, try to find a channel on the same NUMA node + */ + node_mask = cpumask_of_node(cpu_to_node(q_num)); + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + if (!cpumask_test_cpu(tgt_cpu, node_mask)) + continue; + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; goto found_channel; + } + } - /* - * If we reach here, all the channels on the current - * NUMA node are busy. Try to find a channel in - * other NUMA nodes - */ - for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { - if (cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } + /* + * If we reach here, all the channels on the current + * NUMA node are busy. Try to find a channel in + * all NUMA nodes + */ + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; } } + /* + * If we reach here, all the channels are busy. Use the + * original channel found. + */ } else { spin_lock_irqsave(&stor_device->lock, flags); outgoing_channel = stor_device->stor_chns[q_num]; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cef072e7cfbbc..de45924a74c49 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3699,6 +3699,17 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) #endif } +/** + * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue + * @dev: network device + * @qid: stack index of the queue to reset + */ +static inline void netdev_tx_reset_subqueue(const struct net_device *dev, + u32 qid) +{ + netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); +} + /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device @@ -3708,7 +3719,7 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) */ static inline void netdev_reset_queue(struct net_device *dev_queue) { - netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); + netdev_tx_reset_subqueue(dev_queue, 0); } /** diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h new file mode 100644 index 0000000000000..35614f9523f60 --- /dev/null +++ b/include/net/libeth/tx.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TX_H +#define __LIBETH_TX_H + +#include + +#include + +/* Tx buffer completion */ + +/** + * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion + * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required + * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() + * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA + * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + */ +enum libeth_sqe_type { + LIBETH_SQE_EMPTY = 0U, + LIBETH_SQE_CTX, + LIBETH_SQE_SLAB, + LIBETH_SQE_FRAG, + LIBETH_SQE_SKB, +}; + +/** + * struct libeth_sqe - represents a Send Queue Element / Tx buffer + * @type: type of the buffer, see the enum above + * @rs_idx: index of the last buffer from the batch this one was sent in + * @raw: slab buffer to free via kfree() + * @skb: &sk_buff to consume + * @dma: DMA address to unmap + * @len: length of the mapped region to unmap + * @nr_frags: number of frags in the frame this buffer belongs to + * @packets: number of physical packets sent for this frame + * @bytes: number of physical bytes sent for this frame + * @priv: driver-private scratchpad + */ +struct libeth_sqe { + enum libeth_sqe_type type:32; + u32 rs_idx; + + union { + void *raw; + struct sk_buff *skb; + }; + + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + + u32 nr_frags; + u32 packets; + u32 bytes; + + unsigned long priv; +} __aligned_largest; + +/** + * LIBETH_SQE_CHECK_PRIV - check the driver's private SQE data + * @p: type or name of the object the driver wants to fit into &libeth_sqe + * + * Make sure the driver's private data fits into libeth_sqe::priv. To be used + * right after its declaration. + */ +#define LIBETH_SQE_CHECK_PRIV(p) \ + static_assert(sizeof(p) <= sizeof_field(struct libeth_sqe, priv)) + +/** + * struct libeth_cq_pp - completion queue poll params + * @dev: &device to perform DMA unmapping + * @ss: onstack NAPI stats to fill + * @napi: whether it's called from the NAPI context + * + * libeth uses this structure to access objects needed for performing full + * Tx complete operation without passing lots of arguments and change the + * prototypes each time a new one is added. + */ +struct libeth_cq_pp { + struct device *dev; + struct libeth_sq_napi_stats *ss; + + bool napi; +}; + +/** + * libeth_tx_complete - perform Tx completion for one SQE + * @sqe: SQE to complete + * @cp: poll params + * + * Do Tx complete for all the types of buffers, incl. freeing, unmapping, + * updating the stats etc. + */ +static inline void libeth_tx_complete(struct libeth_sqe *sqe, + const struct libeth_cq_pp *cp) +{ + switch (sqe->type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_SKB: + case LIBETH_SQE_FRAG: + case LIBETH_SQE_SLAB: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (sqe->type) { + case LIBETH_SQE_SKB: + cp->ss->packets += sqe->packets; + cp->ss->bytes += sqe->bytes; + + napi_consume_skb(sqe->skb, cp->napi); + break; + case LIBETH_SQE_SLAB: + kfree(sqe->raw); + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +#endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h new file mode 100644 index 0000000000000..603825e451339 --- /dev/null +++ b/include/net/libeth/types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TYPES_H +#define __LIBETH_TYPES_H + +#include + +/** + * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @raw: alias to access all the fields as an array + */ +struct libeth_sq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +#endif /* __LIBETH_TYPES_H */ diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 27684135bb4d1..35507588a14d5 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -224,7 +224,15 @@ struct gdma_dev { struct auxiliary_device *adev; }; -#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE +/* MANA_PAGE_SIZE is the DMA unit */ +#define MANA_PAGE_SHIFT 12 +#define MANA_PAGE_SIZE BIT(MANA_PAGE_SHIFT) +#define MANA_PAGE_ALIGN(x) ALIGN((x), MANA_PAGE_SIZE) +#define MANA_PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), MANA_PAGE_SIZE) +#define MANA_PFN(a) ((a) >> MANA_PAGE_SHIFT) + +/* Required by HW */ +#define MANA_MIN_QSIZE MANA_PAGE_SIZE #define GDMA_CQE_SIZE 64 #define GDMA_EQE_SIZE 16 diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 5da352adf1d68..6e4fee310e865 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -40,7 +40,8 @@ enum TRI_STATE { #define MAX_SEND_BUFFERS_PER_QUEUE 256 -#define EQ_SIZE (8 * PAGE_SIZE) +#define EQ_SIZE (8 * MANA_PAGE_SIZE) + #define LOG2_EQ_THROTTLE 3 #define MAX_PORTS_IN_MANA_DEV 256 diff --git a/net/core/filter.c b/net/core/filter.c index 06c6ff7fb511b..2bffec483f26e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3497,13 +3497,20 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Due to header grow, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. + * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. + */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } } return 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d5aee2190943e..d12b9e82e8478 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -275,9 +276,32 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; - if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) - return __udp_gso_segment_list(gso_skb, features, is_ipv6); + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) { + /* Detect modified geometry and pass those to skb_segment. */ + if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) + return __udp_gso_segment_list(gso_skb, features, is_ipv6); + + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ + gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; + gso_skb->csum_offset = offsetof(struct udphdr, check); + gso_skb->ip_summed = CHECKSUM_PARTIAL; + + uh = udp_hdr(gso_skb); + if (is_ipv6) + uh->check = ~udp_v6_check(gso_skb->len, + &ipv6_hdr(gso_skb)->saddr, + &ipv6_hdr(gso_skb)->daddr, 0); + else + uh->check = ~udp_v4_check(gso_skb->len, + ip_hdr(gso_skb)->saddr, + ip_hdr(gso_skb)->daddr, 0); + } mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 32009d7bd73bc..4ffba186c0bea 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -803,6 +803,7 @@ static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); struct udp_replicast *rcast, *tmp; + struct tipc_net *tn; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { dst_cache_destroy(&rcast->dst_cache); @@ -810,10 +811,14 @@ static void cleanup_bearer(struct work_struct *work) kfree_rcu(rcast, rcu); } - atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); + tn = tipc_net(sock_net(ub->ubsock->sk)); + dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); + + /* Note: could use a call_rcu() to avoid another synchronize_net() */ synchronize_net(); + atomic_dec(&tn->wq_count); kfree(ub); }