From a3cd1bc78003e364c1b5a2d656be63c59efeb6be Mon Sep 17 00:00:00 2001
From: Jeremy Allison
Date: Wed, 28 Aug 2024 14:48:59 -0700
Subject: [PATCH 01/50] crypto: jitter - replace LFSR with SHA3-256
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using the kernel crypto API, the SHA3-256 algorithm is used as the
conditioning element to replace the LFSR in the Jitter RNG. All other
parts of the Jitter RNG are unchanged.

The application and use of the SHA-3 conditioning operation is identical
to the user space Jitter RNG 3.4.0 by applying the following concept:

- The Jitter RNG initializes a SHA-3 state which acts as the "entropy
  pool" when the Jitter RNG is allocated.

- When a new time delta is obtained, it is inserted into the "entropy
  pool" with a SHA-3 update operation. Note, this operation in most of
  the cases is a simple memcpy() onto the SHA-3 stack.

- To cause a true SHA-3 operation for each time delta operation, a
  second SHA-3 operation is performed hashing Jitter RNG status
  information. The final message digest is also inserted into the
  "entropy pool" with a SHA-3 update operation. Yet, this data is not
  considered to provide any entropy, but it shall stir the entropy pool.

- To generate a random number, a SHA-3 final operation is performed to
  calculate a message digest, followed by an immediate SHA-3 init to
  re-initialize the "entropy pool". The obtained message digest is one
  block of the Jitter RNG that is returned to the caller.

Mathematically speaking, the random number generated by the Jitter RNG
is:

	aux_t = SHA-3(Jitter RNG state data)

	Jitter RNG block = SHA-3(time_i || aux_i || time_(i-1) ||
	                         aux_(i-1) || ... || time_(i-255) ||
	                         aux_(i-255))

when assuming that the OSR = 1, i.e. the default value. This operation
implies that the Jitter RNG has an output block size of 256 bits instead
of the 64 bits of the LFSR-based Jitter RNG that is replaced with this
patch.

The patch also replaces the varying number of invocations of the
conditioning function with one fixed number of invocations. The use of
the conditioning function is consistent with the userspace Jitter RNG
library version 3.4.0.

The code is tested with a system that exhibited the least amount of
entropy generated by the Jitter RNG: the SiFive Unmatched RISC-V system.
The measured entropy rate is well above the heuristically implied
entropy value of 1 bit of entropy per time delta. On all other tested
systems, the measured entropy rate is even higher by orders of
magnitude. The measurement was performed using updated tooling provided
with the user space Jitter RNG library test framework.

The performance of the Jitter RNG with this patch is about on par with
the performance of the Jitter RNG without the patch.
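[Editorial note: the pool lifecycle described above can be distilled into a
short sketch against the kernel shash API. This is an illustration, not part
of the patch: the helper names ending in _sketch are invented for exposition,
error handling, locking and the stuck-test plumbing are elided, and the real
implementation is in the jitterentropy-kcapi.c hunks below.]

#include <crypto/hash.h>
#include <crypto/sha3.h>

/* Insert one time delta into the "entropy pool" (the running SHA-3 state). */
static int pool_add_delta_sketch(struct shash_desc *pool, u64 delta)
{
	/* A stuck delta would be dropped by the caller; see jent_hash_time(). */
	return crypto_shash_update(pool, (u8 *)&delta, sizeof(delta));
}

/* Produce one 256-bit block, then re-initialize the pool and stir the
 * fresh state with the block just produced, as the patch does. */
static int pool_read_block_sketch(struct shash_desc *pool,
				  u8 block[SHA3_256_DIGEST_SIZE])
{
	return crypto_shash_final(pool, block) ?:
	       crypto_shash_init(pool) ?:
	       crypto_shash_update(pool, block, SHA3_256_DIGEST_SIZE);
}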
Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Back-port of commit bb897c55042e9330bcf88b4b13cbdd6f9fabdd5e Author: Stephan MΓΌller Date: Fri Apr 21 08:08:04 2023 +0200 Signed-off-by: Jeremy Allison --- crypto/Kconfig | 1 + crypto/jitterentropy-kcapi.c | 183 +++++++++++++++++++++++++++++++---- crypto/jitterentropy.c | 143 +++++++++------------------ crypto/jitterentropy.h | 10 +- 4 files changed, 218 insertions(+), 119 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 4591652cc086c..e9b9878043ea9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1777,6 +1777,7 @@ config CRYPTO_ANSI_CPRNG tristate "Pseudo Random Number Generation for Cryptographic modules" select CRYPTO_AES select CRYPTO_RNG + select CRYPTO_SHA3 help This option enables the generic pseudo random number generator for cryptographic modules. Uses the Algorithm specified in diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index 5b39c0c6a7c31..e878ff7585349 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. + * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. 
+ */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. + */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. + */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + + /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. 
+ */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 1b0377e6efa0b..599c6b4d08215 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -285,15 +286,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. 
* * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -301,12 +300,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -328,81 +322,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? * * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. 
Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -436,7 +381,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -504,14 +449,15 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ @@ -555,7 +501,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -585,7 +531,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. */ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -595,7 +541,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -609,7 +556,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -636,6 +584,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -649,7 +599,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -662,6 +612,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. 
These @@ -699,7 +650,7 @@ int jent_entropy_init(void) /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); From f96633627cc95a68adf0c25c8b898b7eeeb6b21f Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Wed, 28 Aug 2024 14:51:11 -0700 Subject: [PATCH 02/50] crypto: aead,cipher - zeroize key buffer after use I.G 9.7.B for FIPS 140-3 specifies that variables temporarily holding cryptographic information should be zeroized once they are no longer needed. Accomplish this by using kfree_sensitive for buffers that previously held the private key. Signed-off-by: Hailey Mothershead Signed-off-by: Herbert Xu Back-ported from commit 23e4099bdc3c8381992f9eb975c79196d6755210 Author: Hailey Mothershead Date: Mon Apr 15 22:19:15 2024 +0000 Signed-off-by: Jeremy Allison --- crypto/aead.c | 3 +-- crypto/cipher.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/aead.c b/crypto/aead.c index 60b3bbe973e75..033aa5e9584c3 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -45,8 +45,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/cipher.c b/crypto/cipher.c index 57836c30a49a6..ba4193ba237bf 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -38,8 +38,7 @@ static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } From 9abcf8c131c08000297987db0bf783b9a9579492 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Thu, 29 Aug 2024 16:55:16 -0700 Subject: [PATCH 03/50] SUSE: patch: crypto-dh-implement-FIPS-PCT.patch Signed-off-by: Jeremy Allison --- crypto/dh.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/crypto/dh.c b/crypto/dh.c index 1bcb47c90055a..57851d7ad4ac0 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -218,10 +218,35 @@ static int dh_compute_value(struct kpp_request *req) /* SP800-56A rev 3 5.6.2.1.3 key check */ } else { + MPI val_pct; + if (dh_is_pubkey_valid(ctx, 
val)) { ret = -EAGAIN; goto err_free_val; } + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance + * of Pair-wise Consistency"): recompute the + * public key and check if the results match. + */ + val_pct = mpi_alloc(0); + if (!val_pct) { + ret = -ENOMEM; + goto err_free_val; + } + + ret = _compute_val(ctx, base, val_pct); + if (ret) { + mpi_free(val_pct); + goto err_free_val; + } + + if (mpi_cmp(val, val_pct) != 0) { + mpi_free(val_pct); + panic("DH PCT failed in FIPS mode"); + } + mpi_free(val_pct); } } From b605d16a6a740e102690fb8a7b0bf5ef836ec604 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Thu, 29 Aug 2024 16:58:53 -0700 Subject: [PATCH 04/50] SUSE: patch: crypto-ecdh-implement-FIPS-PCT.patch Signed-off-by: Jeremy Allison --- crypto/ecdh.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index 96f69cbbc38a3..01516551fc16f 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "ecc.h" struct ecdh_ctx { @@ -113,6 +114,36 @@ static int ecdh_compute_value(struct kpp_request *req) ctx->private_key, public_key); buf = public_key; nbytes = public_key_sz; + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance of + * Pair-wise Consistency"): recompute the public key + * and check if the results match. + */ + if (fips_enabled) { + u64 *public_key_pct; + + if (ret < 0) + goto free_all; + + public_key_pct = kmalloc(public_key_sz, GFP_KERNEL); + if (!public_key_pct) { + ret = -ENOMEM; + goto free_all; + } + + ret = ecc_make_pub_key(ctx->curve_id, ctx->ndigits, + ctx->private_key, + public_key_pct); + if (ret < 0) { + kfree(public_key_pct); + goto free_all; + } + + if (memcmp(public_key, public_key_pct, public_key_sz)) + panic("ECDH PCT failed in FIPS mode"); + kfree(public_key_pct); + } } if (ret < 0) From c15d0569dd53fbe1764ecf2e326d0758377a6536 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Wed, 18 Sep 2024 09:09:00 -0700 Subject: [PATCH 05/50] crypto: jitter - add oversampling of noise source The output n bits can receive more than n bits of min entropy, of course, but the fixed output of the conditioning function can only asymptotically approach the output size bits of min entropy, not attain that bound. Random maps will tend to have output collisions, which reduces the creditable output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). The value "64" is justified in Appendix A.4 of the current 90C draft, and aligns with NIST's in "epsilon" definition in this document, which is that a string can be considered "full entropy" if you can bound the min entropy in each bit of output to at least 1-epsilon, where epsilon is required to be <= 2^(-32). Note, this patch causes the Jitter RNG to cut its performance in half in FIPS mode because the conditioning function of the LFSR produces 64 bits of entropy in one block. The oversampling requires that additionally 64 bits of entropy are sampled from the noise source. If the conditioner is changed, such as using SHA-256, the impact of the oversampling is only one fourth, because for the 256 bit block of the conditioner, only 64 additional bits from the noise source must be sampled. This patch is derived from the user space jitterentropy-library. Signed-off-by: Stephan Mueller Reviewed-by: Simo Sorce Signed-off-by: Herbert Xu Back-port of upstream commit 908dffaf88a248e542bdae3ca174f27b8f4ccf37. 
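[Editorial note: a worked example makes the cost statements above concrete,
using the constants from this series. In FIPS mode the collection loop in
jent_gen_entropy() runs (DATA_SIZE_BITS + 64) * osr iterations instead of
DATA_SIZE_BITS * osr. With the old 64-bit LFSR conditioner at osr = 1 that
is (64 + 64) * 1 = 128 time deltas instead of 64, i.e. the performance cut
in half noted above; with the SHA3-256 conditioner from patch 01
(DATA_SIZE_BITS = 256) it is (256 + 64) * 1 = 320 instead of 256, i.e. only
the one-fourth overhead.]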
Signed-off-by: Jeremy Allison --- crypto/jitterentropy.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 599c6b4d08215..c18fc9fd43d0a 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -119,6 +119,22 @@ struct rand_data { #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ +/* + * The output n bits can receive more than n bits of min entropy, of course, + * but the fixed output of the conditioning function can only asymptotically + * approach the output size bits of min entropy, not attain that bound. Random + * maps will tend to have output collisions, which reduces the creditable + * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). + * + * The value "64" is justified in Appendix A.4 of the current 90C draft, + * and aligns with NIST's in "epsilon" definition in this document, which is + * that a string can be considered "full entropy" if you can bound the min + * entropy in each bit of output to at least 1-epsilon, where epsilon is + * required to be <= 2^(-32). + */ +#define JENT_ENTROPY_SAFETY_FACTOR 64 + +#include #include "jitterentropy.h" /*************************************************************************** @@ -463,7 +479,10 @@ static int jent_measure_jitter(struct rand_data *ec) */ static void jent_gen_entropy(struct rand_data *ec) { - unsigned int k = 0; + unsigned int k = 0, safety_factor = 0; + + if (fips_enabled) + safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec); @@ -477,7 +496,7 @@ static void jent_gen_entropy(struct rand_data *ec) * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ - if (++k >= (DATA_SIZE_BITS * ec->osr)) + if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } From 5f28be389909506e3cad5120dd91c4ee3663626d Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Mon, 23 Sep 2024 12:40:15 -0700 Subject: [PATCH 06/50] crypto: ecdh - explicitly zeroize private_key private_key is overwritten with the key parameter passed in by the caller (if present), or alternatively a newly generated private key. However, it is possible that the caller provides a key (or the newly generated key) which is shorter than the previous key. In that scenario, some key material from the previous key would not be overwritten. The easiest solution is to explicitly zeroize the entire private_key array first. Note that this patch slightly changes the behavior of this function: previously, if the ecc_gen_privkey failed, the old private_key would remain. Now, the private_key is always zeroized. This behavior is consistent with the case where params.key is set and ecc_is_key_valid fails. 
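[Editorial note: the shape of the bug fixed here is easier to see in a
standalone sketch. This is hypothetical, self-contained code, not the kernel
implementation: MAX_KEY_WORDS stands in for ECC_MAX_DIGITS and
set_key_sketch() for ecdh_set_secret().]

#include <stdint.h>
#include <string.h>

#define MAX_KEY_WORDS 8			/* stand-in for ECC_MAX_DIGITS */

static uint64_t private_key[MAX_KEY_WORDS];	/* persists across setkey calls */

static void set_key_sketch(const uint64_t *key, size_t nwords)
{
	/*
	 * Without this memset, loading a key with nwords smaller than the
	 * previous key's length leaves the old key's tail words intact in
	 * private_key -- exactly the stale key material the patch zeroizes.
	 */
	memset(private_key, 0, sizeof(private_key));
	memcpy(private_key, key, nwords * sizeof(*key));
}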
Signed-off-by: Joachim Vandersmissen Signed-off-by: Herbert Xu Back-port of upstream commit: 73e5984e540a76a2ee1868b91590c922da8c24c9 Signed-off-by: Jeremy Allison --- crypto/ecdh.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index 01516551fc16f..c5e5e0c79235a 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -54,6 +54,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, ctx->curve_id = params.curve_id; ctx->ndigits = ndigits; + memset(ctx->private_key, 0, sizeof(ctx->private_key)); + if (!params.key || !params.key_size) return ecc_gen_privkey(ctx->curve_id, ctx->ndigits, ctx->private_key); From 0ae46e4c63ef9c5b5b54110df284988d102018a9 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Mon, 23 Sep 2024 12:47:07 -0700 Subject: [PATCH 07/50] KEYS: use kfree_sensitive with key key might contain private part of the key, so better use kfree_sensitive to free it Signed-off-by: Mahmoud Adam Signed-off-by: Herbert Xu Back-port of upstream commit: 9f3fa6bc4ff8515da1349c44a77e7327bd2f4788 Signed-off-by: Jeremy Allison --- crypto/asymmetric_keys/public_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 4a967b545f4ab..c051916775a0c 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -139,7 +139,7 @@ int public_key_verify_signature(const struct public_key *pkey, ret = -EKEYREJECTED; out_free_output: - kfree(output); + kfree_sensitive(output); error_free_req: akcipher_request_free(req); error_free_tfm: From 6acce487a85b920c19179cab880a57654a739caf Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Mon, 23 Sep 2024 12:55:09 -0700 Subject: [PATCH 08/50] In essiv_aead_setkey(), use the same logic as crypto_authenc_esn_setkey() to zeroize keys on exit. Signed-off-by: Jeremy Allison --- crypto/essiv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/essiv.c b/crypto/essiv.c index a8befc8fb06ed..e1707538392e3 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -127,7 +127,7 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); if (err) - return err; + goto out; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & @@ -137,6 +137,8 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_aead_set_flags(tfm, crypto_cipher_get_flags(tctx->essiv_cipher) & CRYPTO_TFM_RES_MASK); +out: + memzero_explicit(&keys, sizeof(keys)); return err; } From 6cd2f2aef39a8d0fda499b0304afed1029df90e0 Mon Sep 17 00:00:00 2001 From: Greg Rose Date: Tue, 22 Oct 2024 09:50:54 -0700 Subject: [PATCH 09/50] github actions: Incorporate feedback on workflows Add workflows for pushes and pull requests. 
Signed-off-by: Greg Rose --- .github/workflows/diffdiff.py | 129 +++++++++++++++++++ .github/workflows/github-actions-demo.yml | 26 ++++ .github/workflows/process-git-request.rb | 140 +++++++++++++++++++++ .github/workflows/process-pull-request.yml | 55 ++++++++ .github/workflows/push-check_aarch64.yml | 33 +++++ .github/workflows/push-check_x86_64.yml | 33 +++++ 6 files changed, 416 insertions(+) create mode 100755 .github/workflows/diffdiff.py create mode 100644 .github/workflows/github-actions-demo.yml create mode 100644 .github/workflows/process-git-request.rb create mode 100644 .github/workflows/process-pull-request.yml create mode 100644 .github/workflows/push-check_aarch64.yml create mode 100644 .github/workflows/push-check_x86_64.yml diff --git a/.github/workflows/diffdiff.py b/.github/workflows/diffdiff.py new file mode 100755 index 0000000000000..dc2c5ab0d1e9d --- /dev/null +++ b/.github/workflows/diffdiff.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# + +import argparse +import copy +import difflib +import io +import git +import os +import re +import subprocess +import sys +import tempfile + +verbose = False + + +def get_upstream_commit(upstream, c): + for l in c.message.splitlines(): + try: + sha = re.match('\s*commit\s+(?P\S+)', l).groups()[0].upper() + return upstream.commit(sha) + except: + True + +def get_diff(d): + dif = '' + df = False + for l in d.splitlines(): + if l[:10] == 'diff --git': + df = True + if not df: + continue + dif = dif + l + '\n' + return dif + + +def trim_unchanged_files(lines): + dl = [] + ld = 0 # Last line with a 'diff --git' we saw + hd = False # Have we seen a changed line since ld? + i = 0 + for i, l in enumerate(lines): + if l[:4] == '+++ ' or l[:4] == '--- ' : + continue + if l[0] == '+' or l[0] == '-': + hd = True + if l[:11] == ' diff --git': + if ld: # We are at a new diff now, last one started at 'ld' + if not hd: + dl.insert(0, (ld, i+1),) + ld = i + hd = False # Reset hasdiff to False as we start a new section + # and check the tail + if not hd: + dl.insert(0, (ld, i+1),) + # delete the unchanged file sections + for d in dl: + del lines[d[0]:d[1]] + return lines + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-v', action='store_true', help='Verbose') + parser.add_argument('--colour', action='store_true', help='Colorize the diff. Green for additions, red for deletions') + parser.add_argument('--commit', help='Commit in current tree to diffdiff. Default is the most recent commit.') + parser.add_argument('--upstream', help='A directory that contains the current upstream of linus kernel tree where we can find the commits we reference. 
Default is the current repo') + args = parser.parse_args() + + + if args.v: + verbose = True + + srcgit = git.Repo.init('.') + upstream = git.Repo.init(args.upstream) + c = srcgit.head.commit if not args.commit else srcgit.commit(args.commit) + uc = get_upstream_commit(upstream, c) + + dc = get_diff(srcgit.git.show(c)) + duc = get_diff(upstream.git.show(uc)) + + with open('c.diff', 'w') as f: + f.write(dc) + with open('u.diff', 'w') as f: + f.write(duc) + + res = subprocess.run(['diff', '-u', 'u.diff', 'c.diff'], + check=False, stdout=subprocess.PIPE) + lines = res.stdout.splitlines() + dd = [] + for l in lines: + l = str(l)[2:-1] + if l[:6] == '-index': + continue + if l[:6] == '+index': + continue + if l[:3] == '-@@': + continue + if l[:3] == '+@@': + dd.append(' ' + l[1:]) + continue + dd.append(l) + + # trim diffs for files that did not change + lines = trim_unchanged_files(dd) + + # colorize the diff + diffs = 0 + if args.colour: + dd = [] + for l in lines: + if l[0:4] != '+++ ' and l[0:4] != '--- ': + if l[0] == '+': + l = '\033[42m' + l + '\033[0m' + diffs = diffs + 1 + if l[0] == '-': + l = '\033[41m' + l + '\033[0m' + diffs = diffs + 1 + dd.append(l) + lines = dd + + + if diffs: + for l in lines: + print(l) + + sys.exit(diffs) diff --git a/.github/workflows/github-actions-demo.yml b/.github/workflows/github-actions-demo.yml new file mode 100644 index 0000000000000..de3dbc4d34b9b --- /dev/null +++ b/.github/workflows/github-actions-demo.yml @@ -0,0 +1,26 @@ +name: GitHub Actions Sanity Check +run-name: ${{ github.actor }} is running actions - this runs as a sanity check πŸš€ +on: + push: + branches: + - '**' + - '!mainline' + +jobs: + Explore-GitHub-Actions: + runs-on: ubuntu-latest + steps: + - run: echo "πŸŽ‰ The job was automatically triggered by a ${{ github.event_name }} event." + - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" + - run: echo "πŸ”Ž The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." + - name: Check out repository code + uses: actions/checkout@v4 + - run: echo "πŸ’‘ The ${{ github.repository }} repository has been cloned to the runner." + - run: echo "πŸ–₯️ The workflow is now ready to test your code on the runner." + - name: List files in the repository + run: | + ls ${{ github.workspace }} + df . + df / + pwd + - run: echo "🍏 This job's status is ${{ job.status }}." diff --git a/.github/workflows/process-git-request.rb b/.github/workflows/process-git-request.rb new file mode 100644 index 0000000000000..04a2ccd49b8bc --- /dev/null +++ b/.github/workflows/process-git-request.rb @@ -0,0 +1,140 @@ +require 'open3' + +requestors = { "gvrose8192" => "" } + +def file_prepend(file, str) + new_contents = "" + File.open(file, 'r') do |fd| + contents = fd.read + new_contents = str << contents + end + # Overwrite file but now with prepended string on it + File.open(file, 'w') do |fd| + fd.write(new_contents) + end +end + +def process_git_request(fname, target_branch, source_branch, prj_dir) + retcode = 200 #presume success +# puts "Opening file " + fname + file = File.new(fname, "w") + working_dir = prj_dir +# puts "Working Dir : " + working_dir + Dir.chdir working_dir +# puts "pwd : " + Dir.pwd + git_cmd = "git log --oneline --no-abbrev-commit origin/" + target_branch + ".." 
+ "origin/" + source_branch +# puts git_cmd + out, err, status = Open3.capture3(git_cmd) + if status.exitstatus != 0 + puts "Command error output is " + err + file.write("Command error output is " + err) + file.close + retcode = 201 + return retcode + end + output_lines = out.split(' ') +# we just want the commit sha IDs + output_lines.each { |x| +# puts "This is output_lines " + x + upstream_diff = false + if !x[/\H/] + if x.length < 40 + next + end + git_cmd = "git show " + x + gitlog_out, gitlog_err, gitlog_status = Open3.capture3(git_cmd) + if gitlog_status.exitstatus != 0 + file.write("git show command error output is " + gitlog_err) + retcode = 201 + end + loglines = gitlog_out.lines.map(&:chomp) + lines_counted = 0 + local_diffdiff_sha = "" + upstream_diffdiff_sha = "" + loglines.each { |logline| + lines_counted = lines_counted + 1 + if lines_counted == 1 + local_commit_sha = logline.match("[0-9a-f]\{40\}") + local_diffdiff_sha = local_commit_sha.to_s +# puts "Local : " + local_diffdiff_sha + file.write("Merge Request sha: " + local_diffdiff_sha) + file.write("\n") + end + if lines_counted == 2 #email address + if !logline.downcase.include? "ciq.com" + # Bad Author + s = "error:\nBad " + logline + "\n" + puts s + file.write(s) + retcode = 201 + else + file.write("\t" + logline + "\n") + end + end + if lines_counted > 1 + if logline.downcase.include? "jira" + file.write("\t" + logline + "\n") + end + if logline.downcase.include? "upstream-diff" + upstream_diff = true + end + if logline.downcase.include? "commit" + commit_sha = logline.match("[0-9a-f]\{40\}") + upstream_diffdiff_sha = commit_sha.to_s +# puts "Upstream : " + upstream_diffdiff_sha + if (!upstream_diffdiff_sha.empty?) + file.write("\tUpstream sha: " + upstream_diffdiff_sha) + file.write("\n") + end + end + end + if lines_counted > 8 #Everything we need should be in the first 8 lines + break + end + } + if !local_diffdiff_sha.empty? && !upstream_diffdiff_sha.empty? + diff_cmd = Dir.pwd + "/.github/workflows/diffdiff.py --colour --commit " + local_diffdiff_sha + puts "diffdiff: " + diff_cmd + diff_out, diff_err, diff_status = Open3.capture3(diff_cmd) + if diff_status.exitstatus != 0 && !upstream_diff + puts "diffdiff out: " + diff_out + puts "diffdiff err: " + diff_err + retcode = 201 + file.write("error:\nCommit: " + local_diffdiff_sha + " differs with no upstream tag in commit message\n") + end + end + end + } + file.close + return retcode +end + +first_arg, *argv_in = ARGV +if argv_in.length < 5 + puts "Not enough arguments: fname, target_branch, source_branch, prj_dir, pull_request, requestor" + exit +end +fname = first_arg.to_s +fname = "tmp-" + fname +# puts "filename is " + fname +target_branch = argv_in[0].to_s +# puts "target branch is " + target_branch +source_branch = argv_in[1].to_s +# puts "source branch is " + source_branch +prj_dir = argv_in[2].to_s +# puts "project dir is " + prj_dir +pullreq = argv_in[3].to_s +# puts "pull request is " + pullreq +requestor = argv_in[4].to_s +retcode = process_git_request(fname, target_branch, source_branch, prj_dir) +if retcode != 200 + File.open(fname, 'r') do |fd| + contents = fd.read + puts contents + end + exit(1) +else + puts "Done" +end +exit(0) + diff --git a/.github/workflows/process-pull-request.yml b/.github/workflows/process-pull-request.yml new file mode 100644 index 0000000000000..a4f9f43fa425e --- /dev/null +++ b/.github/workflows/process-pull-request.yml @@ -0,0 +1,55 @@ +# This workflow uses actions that are not certified by GitHub. 
+# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Pull Request Checker + +on: + pull_request: + branches: + - '**' + - '!mainline' + +permissions: + contents: read + +jobs: + test: + + runs-on: + labels: kernel-build + strategy: + matrix: + ruby-version: ['3.0'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Ruby + # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby, + # change this to (see https://github.com/ruby/setup-ruby#versioning): + uses: ruby/setup-ruby@v1 + # uses: ruby/setup-ruby@55283cc23133118229fd3f97f9336ee23a179fcf # v1.146.0 + with: + ruby-version: ${{ matrix.ruby-version }} + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + - name: Set up Python + uses: actions/setup-python@v5 + - name: Run tests + run: | + /usr/bin/pip3 install gitPython + python -c "import sys; import git; print(sys.version)" + git fetch origin ${{ github.base_ref }} + git fetch origin ${{ github.head_ref }} + git remote add linux https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git + git fetch --shallow-since="3 years ago" linux + echo "Will run process-git-request.rb with:" + echo "fname = ${{ github.run_id }}" + echo "target_branch = ${{ github.base_ref }}" + echo "source_branch = ${{ github.head_ref }}" + echo "prj_dir = ${{ github.workspace }}" + echo "pull_request = ${{ github.ref }}" + echo "requestor = ${{ github.actor }}" + cd ${{ github.workspace }} + /usr/bin/ruby .github/workflows/process-git-request.rb ${{ github.run_id }} ${{ github.base_ref }} \ + ${{ github.head_ref }} ${{ github.workspace }} ${{ github.ref }} ${{ github.actor }} diff --git a/.github/workflows/push-check_aarch64.yml b/.github/workflows/push-check_aarch64.yml new file mode 100644 index 0000000000000..2dda81c43aa79 --- /dev/null +++ b/.github/workflows/push-check_aarch64.yml @@ -0,0 +1,33 @@ +name: CI +on: + push: + branches: + - '**' + - '!mainline' + +jobs: + kernel-build-job: + runs-on: + labels: kernel-build-arm64 + container: + image: rockylinux:8 + env: + ROCKY_ENV: rocky8 + ports: + - 80 + options: --cpus 8 + steps: + - name: Install tools and Libraries + run: | + dnf groupinstall 'Development Tools' -y + dnf install --enablerepo=devel bc dwarves kernel-devel openssl-devel elfutils-libelf-devel -y + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Build the Kernel + run: | + git config --global --add safe.directory /__w/kernel-src-git/kernel-src-git + cp configs/kernel-4.18.0-aarch64.config .config + make olddefconfig + make -j8 diff --git a/.github/workflows/push-check_x86_64.yml b/.github/workflows/push-check_x86_64.yml new file mode 100644 index 0000000000000..2aa1eb2ed4f18 --- /dev/null +++ b/.github/workflows/push-check_x86_64.yml @@ -0,0 +1,33 @@ +name: CI +on: + push: + branches: + - '**' + - '!mainline' + +jobs: + kernel-build-job: + runs-on: + labels: kernel-build + container: + image: rockylinux:8 + env: + ROCKY_ENV: rocky8 + ports: + - 80 + options: --cpus 8 + steps: + - name: Install tools and Libraries + run: | + dnf groupinstall 'Development Tools' -y + dnf install --enablerepo=devel bc dwarves kernel-devel openssl-devel elfutils-libelf-devel -y + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Build the Kernel + run: | + git config --global --add safe.directory /__w/kernel-src-git/kernel-src-git + cp configs/kernel-4.18.0-x86_64.config .config + make 
olddefconfig + make -j8 From 2ef5ea633c77f366d548bf8cf2c55b95db78a02f Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Mon, 14 Apr 2025 13:06:44 -0400 Subject: [PATCH 10/50] configs: x86_64: Sync with dist-git LE-2786 Sync kernel-x86_64.config with el86-fips-compliant-8 branch from internal dist-git. Same as shipped src.rpm. Signed-off-by: Jonathan Maple --- configs/kernel-x86_64.config | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/configs/kernel-x86_64.config b/configs/kernel-x86_64.config index 626a6d72a64f0..af22210c4aadb 100644 --- a/configs/kernel-x86_64.config +++ b/configs/kernel-x86_64.config @@ -316,12 +316,10 @@ # CONFIG_CRYPTO_AEGIS128_AESNI_SSE2 is not set # CONFIG_CRYPTO_AEGIS256 is not set # CONFIG_CRYPTO_AEGIS256_AESNI_SSE2 is not set -# CONFIG_CRYPTO_AES_TI is not set # CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set # CONFIG_CRYPTO_DEV_CCREE is not set # CONFIG_CRYPTO_DEV_CHELSIO_TLS is not set # CONFIG_CRYPTO_DEV_VIRTIO is not set -# CONFIG_CRYPTO_KEYWRAP is not set # CONFIG_CRYPTO_LZ4 is not set # CONFIG_CRYPTO_LZ4HC is not set # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -2676,6 +2674,7 @@ CONFIG_CROSS_MEMORY_ATTACH=y CONFIG_CRYPTO=y CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_NI_INTEL=y +CONFIG_CRYPTO_AES_TI=y CONFIG_CRYPTO_AES_X86_64=y CONFIG_CRYPTO_ANSI_CPRNG=m CONFIG_CRYPTO_ANUBIS=m @@ -2745,6 +2744,7 @@ CONFIG_CRYPTO_GHASH=y CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_KEYWRAP=y CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_MANAGER=y @@ -2768,7 +2768,7 @@ CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_SHA1_SSSE3=y CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_SHA256_SSSE3=y -CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SHA3=y CONFIG_CRYPTO_SHA512=y CONFIG_CRYPTO_SHA512_SSSE3=y CONFIG_CRYPTO_TEA=m @@ -5851,3 +5851,5 @@ CONFIG_ZRAM_WRITEBACK=y CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_ZSWAP=y +CONFIG_CRYPTO_FIPS_CUSTOM_VERSION=y +CONFIG_CRYPTO_FIPS_VERSION="rocky8.20240923" From da4d13c989e09813c740b942534190e88d1c4d14 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Mon, 21 Apr 2025 13:39:01 -0400 Subject: [PATCH 11/50] github actions: Remove demo job Signed-off-by: Jonathan Maple --- .github/workflows/github-actions-demo.yml | 26 ----------------------- 1 file changed, 26 deletions(-) delete mode 100644 .github/workflows/github-actions-demo.yml diff --git a/.github/workflows/github-actions-demo.yml b/.github/workflows/github-actions-demo.yml deleted file mode 100644 index de3dbc4d34b9b..0000000000000 --- a/.github/workflows/github-actions-demo.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: GitHub Actions Sanity Check -run-name: ${{ github.actor }} is running actions - this runs as a sanity check πŸš€ -on: - push: - branches: - - '**' - - '!mainline' - -jobs: - Explore-GitHub-Actions: - runs-on: ubuntu-latest - steps: - - run: echo "πŸŽ‰ The job was automatically triggered by a ${{ github.event_name }} event." - - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" - - run: echo "πŸ”Ž The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." - - name: Check out repository code - uses: actions/checkout@v4 - - run: echo "πŸ’‘ The ${{ github.repository }} repository has been cloned to the runner." - - run: echo "πŸ–₯️ The workflow is now ready to test your code on the runner." - - name: List files in the repository - run: | - ls ${{ github.workspace }} - df . - df / - pwd - - run: echo "🍏 This job's status is ${{ job.status }}." 
From 32d0bf49e4ec6fd34ae153562a2d3bd3e2e2daa6 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Mon, 21 Apr 2025 13:36:31 -0400 Subject: [PATCH 12/50] github actions: Remove push checks We run build checks on pull requests now instead of push Signed-off-by: Jonathan Maple --- .github/workflows/push-check_aarch64.yml | 33 ------------------------ .github/workflows/push-check_x86_64.yml | 33 ------------------------ 2 files changed, 66 deletions(-) delete mode 100644 .github/workflows/push-check_aarch64.yml delete mode 100644 .github/workflows/push-check_x86_64.yml diff --git a/.github/workflows/push-check_aarch64.yml b/.github/workflows/push-check_aarch64.yml deleted file mode 100644 index 2dda81c43aa79..0000000000000 --- a/.github/workflows/push-check_aarch64.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: CI -on: - push: - branches: - - '**' - - '!mainline' - -jobs: - kernel-build-job: - runs-on: - labels: kernel-build-arm64 - container: - image: rockylinux:8 - env: - ROCKY_ENV: rocky8 - ports: - - 80 - options: --cpus 8 - steps: - - name: Install tools and Libraries - run: | - dnf groupinstall 'Development Tools' -y - dnf install --enablerepo=devel bc dwarves kernel-devel openssl-devel elfutils-libelf-devel -y - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Build the Kernel - run: | - git config --global --add safe.directory /__w/kernel-src-git/kernel-src-git - cp configs/kernel-4.18.0-aarch64.config .config - make olddefconfig - make -j8 diff --git a/.github/workflows/push-check_x86_64.yml b/.github/workflows/push-check_x86_64.yml deleted file mode 100644 index 2aa1eb2ed4f18..0000000000000 --- a/.github/workflows/push-check_x86_64.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: CI -on: - push: - branches: - - '**' - - '!mainline' - -jobs: - kernel-build-job: - runs-on: - labels: kernel-build - container: - image: rockylinux:8 - env: - ROCKY_ENV: rocky8 - ports: - - 80 - options: --cpus 8 - steps: - - name: Install tools and Libraries - run: | - dnf groupinstall 'Development Tools' -y - dnf install --enablerepo=devel bc dwarves kernel-devel openssl-devel elfutils-libelf-devel -y - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Build the Kernel - run: | - git config --global --add safe.directory /__w/kernel-src-git/kernel-src-git - cp configs/kernel-4.18.0-x86_64.config .config - make olddefconfig - make -j8 From 3698a6a611ed1fe09ce5f914c107f1e190c34e1c Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Mon, 11 Aug 2025 09:53:11 -0400 Subject: [PATCH 13/50] github actions: Add upstream commit checker LE-3770 This github action checks the PR commits for references to upstream linux commits (lines starting with "commit ") and does two things: 1. Checks that this hash exists in the upstream linux kernel history 2. Checks if there are any Fixes: references for the referenced commit in the upstream linux kernel history If either of those are found to be true a comment is added to the PR with the pertinent information. 
The logic for the check is provided by the check_upstream_commits.py script from kernel-src-tree-tools Signed-off-by: Jonathan Maple --- .github/workflows/upstream-commit-check.yml | 54 +++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/workflows/upstream-commit-check.yml diff --git a/.github/workflows/upstream-commit-check.yml b/.github/workflows/upstream-commit-check.yml new file mode 100644 index 0000000000000..ae25072b95223 --- /dev/null +++ b/.github/workflows/upstream-commit-check.yml @@ -0,0 +1,54 @@ +name: Check Kernel Commits for Upstream Fixes + +on: + pull_request: + types: [opened, synchronize, reopened] + +permissions: + contents: read + pull-requests: write + +jobs: + check-upstream-fixes: + runs-on: ubuntu-latest + + steps: + - name: Checkout PR branch + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.head_ref }} + + - name: Checkout base branch + run: | + git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + + - name: Download check_kernel_commits.py + run: | + curl -sL \ + https://raw.githubusercontent.com/ctrliq/kernel-src-tree-tools/mainline/check_kernel_commits.py \ + -o check_kernel_commits.py + chmod +x check_kernel_commits.py + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Run upstream fixes check + id: checkkernel + run: | + python3 check_kernel_commits.py --repo . --pr_branch "${{ github.head_ref }}" --base_branch "${{ github.base_ref }}" --markdown | tee result.txt + # Save non-empty results for PR comment + if grep -q -v "All referenced commits exist upstream and have no Fixes: tags." result.txt; then + echo "has_findings=true" >> $GITHUB_OUTPUT + fi + + - name: Comment on PR if issues found + if: steps.checkkernel.outputs.has_findings == 'true' + env: + GH_TOKEN: ${{ github.token }} + run: | + gh pr comment ${{ github.event.pull_request.number }} \ + --body "$(cat result.txt)" \ + --repo ${{ github.repository }} From aef9f7d6c5327a92cbf4c2fb8d52910f84049d98 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Fri, 1 Aug 2025 10:55:26 -0400 Subject: [PATCH 14/50] github actions: Add kabi checks LE-3799 After the build check, perform a kabi check Signed-off-by: Jonathan Maple --- .github/workflows/build-check_aarch64.yml | 5 +++++ .github/workflows/build-check_x86_64.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/build-check_aarch64.yml b/.github/workflows/build-check_aarch64.yml index e9b915067c02e..f9af175c4659e 100644 --- a/.github/workflows/build-check_aarch64.yml +++ b/.github/workflows/build-check_aarch64.yml @@ -32,3 +32,8 @@ jobs: cp configs/kernel-aarch64.config .config make olddefconfig make -j8 + - name: Check kabi + run: | + git clone --branch r8 --single-branch https://git.rockylinux.org/staging/rpms/kernel.git kernel-dist-git + git -C kernel-dist-git reset --hard imports/r8/kernel-4.18.0-553.16.1.el8_10 + ./kernel-dist-git/SOURCES/check-kabi -k ./kernel-dist-git/SOURCES/Module.kabi_aarch64 -s Module.symvers diff --git a/.github/workflows/build-check_x86_64.yml b/.github/workflows/build-check_x86_64.yml index 033208cc7fdf1..18f4dc7459a5a 100644 --- a/.github/workflows/build-check_x86_64.yml +++ b/.github/workflows/build-check_x86_64.yml @@ -32,3 +32,8 @@ jobs: cp configs/kernel-x86_64.config .config make olddefconfig make -j8 + - name: Check kabi + run: | + git clone --branch r8 --single-branch https://git.rockylinux.org/staging/rpms/kernel.git kernel-dist-git + git -C kernel-dist-git reset --hard 
imports/r8/kernel-4.18.0-553.16.1.el8_10 + ./kernel-dist-git/SOURCES/check-kabi -k ./kernel-dist-git/SOURCES/Module.kabi_x86_64 -s Module.symvers From 0d9fc2424c175179e3b32e97ce09545a5f92522f Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 12 Aug 2025 16:42:02 -0400 Subject: [PATCH 15/50] github actions: Fix upstream commit check for forks The upstream commit check workflow was failing for pull requests originating from forked repositories. The previous implementation incorrectly assumed the pull request branch existed on the base repository. This commit corrects the workflow to ensure the pull request branch is checked out from the correct source repository, while the base branch is fetched from the target repository. Signed-off-by: Jonathan Maple --- .github/workflows/upstream-commit-check.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/upstream-commit-check.yml b/.github/workflows/upstream-commit-check.yml index ae25072b95223..e95c4e904f8e4 100644 --- a/.github/workflows/upstream-commit-check.yml +++ b/.github/workflows/upstream-commit-check.yml @@ -16,12 +16,14 @@ jobs: - name: Checkout PR branch uses: actions/checkout@v4 with: + repository: ${{ github.event.pull_request.head.repo.full_name }} fetch-depth: 0 ref: ${{ github.head_ref }} - name: Checkout base branch run: | - git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + git remote add base_repo https://github.com/${{ github.repository }}.git + git fetch base_repo ${{ github.base_ref }}:${{ github.base_ref }} - name: Download check_kernel_commits.py run: | From 5b391f301b6ea883a1d948ccfc793af0e9a197c3 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 2 Sep 2025 14:49:31 -0400 Subject: [PATCH 16/50] github actions: Fix process-pull-request for forks The process-pull-request workflow was failing for pull requests originating from forked repositories. The previous implementation incorrectly assumed the pull request branch existed on the base repository. This commit corrects the workflow to ensure the pull request branch is checked out from the correct source repository, while the base branch is fetched from the target repository. Signed-off-by: Jonathan Maple --- .github/workflows/process-git-request.rb | 2 +- .github/workflows/process-pull-request.yml | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/process-git-request.rb b/.github/workflows/process-git-request.rb index 04a2ccd49b8bc..9be1869d51087 100644 --- a/.github/workflows/process-git-request.rb +++ b/.github/workflows/process-git-request.rb @@ -22,7 +22,7 @@ def process_git_request(fname, target_branch, source_branch, prj_dir) # puts "Working Dir : " + working_dir Dir.chdir working_dir # puts "pwd : " + Dir.pwd - git_cmd = "git log --oneline --no-abbrev-commit origin/" + target_branch + ".." + "origin/" + source_branch + git_cmd = "git log --oneline --no-abbrev-commit base_repo/" + target_branch + ".." 
+ "origin/" + source_branch # puts git_cmd out, err, status = Open3.capture3(git_cmd) if status.exitstatus != 0 diff --git a/.github/workflows/process-pull-request.yml b/.github/workflows/process-pull-request.yml index a4f9f43fa425e..4187005d147a4 100644 --- a/.github/workflows/process-pull-request.yml +++ b/.github/workflows/process-pull-request.yml @@ -9,7 +9,7 @@ on: pull_request: branches: - '**' - - '!mainline' + - '!mainline' permissions: contents: read @@ -24,7 +24,18 @@ jobs: ruby-version: ['3.0'] steps: - - uses: actions/checkout@v4 + - name: Checkout PR branch + uses: actions/checkout@v4 + with: + repository: ${{ github.event.pull_request.head.repo.full_name }} + fetch-depth: 0 + ref: ${{ github.head_ref }} + + - name: Checkout base branch + run: | + git remote add base_repo https://github.com/${{ github.repository }}.git + git fetch base_repo ${{ github.base_ref }}:${{ github.base_ref }} + - name: Set up Ruby # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby, # change this to (see https://github.com/ruby/setup-ruby#versioning): @@ -39,8 +50,6 @@ jobs: run: | /usr/bin/pip3 install gitPython python -c "import sys; import git; print(sys.version)" - git fetch origin ${{ github.base_ref }} - git fetch origin ${{ github.head_ref }} git remote add linux https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git git fetch --shallow-since="3 years ago" linux echo "Will run process-git-request.rb with:" From a6d4021a20d654f01d97782d4868a7aa185092a0 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 13 Nov 2025 12:00:41 -0500 Subject: [PATCH 17/50] github actions: remove old pr checker There will be a new PR checker inbound soon this one is just broken so removing it. Signed-off-by: Roxana Nicolescu --- .github/workflows/process-pull-request.yml | 64 ---------------------- 1 file changed, 64 deletions(-) delete mode 100644 .github/workflows/process-pull-request.yml diff --git a/.github/workflows/process-pull-request.yml b/.github/workflows/process-pull-request.yml deleted file mode 100644 index 4187005d147a4..0000000000000 --- a/.github/workflows/process-pull-request.yml +++ /dev/null @@ -1,64 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
- -name: Pull Request Checker - -on: - pull_request: - branches: - - '**' - - '!mainline' - -permissions: - contents: read - -jobs: - test: - - runs-on: - labels: kernel-build - strategy: - matrix: - ruby-version: ['3.0'] - - steps: - - name: Checkout PR branch - uses: actions/checkout@v4 - with: - repository: ${{ github.event.pull_request.head.repo.full_name }} - fetch-depth: 0 - ref: ${{ github.head_ref }} - - - name: Checkout base branch - run: | - git remote add base_repo https://github.com/${{ github.repository }}.git - git fetch base_repo ${{ github.base_ref }}:${{ github.base_ref }} - - - name: Set up Ruby - # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby, - # change this to (see https://github.com/ruby/setup-ruby#versioning): - uses: ruby/setup-ruby@v1 - # uses: ruby/setup-ruby@55283cc23133118229fd3f97f9336ee23a179fcf # v1.146.0 - with: - ruby-version: ${{ matrix.ruby-version }} - bundler-cache: true # runs 'bundle install' and caches installed gems automatically - - name: Set up Python - uses: actions/setup-python@v5 - - name: Run tests - run: | - /usr/bin/pip3 install gitPython - python -c "import sys; import git; print(sys.version)" - git remote add linux https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git - git fetch --shallow-since="3 years ago" linux - echo "Will run process-git-request.rb with:" - echo "fname = ${{ github.run_id }}" - echo "target_branch = ${{ github.base_ref }}" - echo "source_branch = ${{ github.head_ref }}" - echo "prj_dir = ${{ github.workspace }}" - echo "pull_request = ${{ github.ref }}" - echo "requestor = ${{ github.actor }}" - cd ${{ github.workspace }} - /usr/bin/ruby .github/workflows/process-git-request.rb ${{ github.run_id }} ${{ github.base_ref }} \ - ${{ github.head_ref }} ${{ github.workspace }} ${{ github.ref }} ${{ github.actor }} From af333765e4e87f95025fb6f6222d021cc8f1cd6a Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Fri, 17 Oct 2025 12:52:16 -0400 Subject: [PATCH 18/50] github actions: Use reusable validate kernel commits workflow Simplifies the workflow to use the reusable workflow defined in main branch. This reduces duplication and makes the workflow easier to maintain across multiple branches. 
The workflow was renamed because it now includes validation over and above just checking for upstream fixes Signed-off-by: Roxana Nicolescu --- .github/workflows/upstream-commit-check.yml | 56 ------------------- .github/workflows/validate-kernel-commits.yml | 10 ++++ 2 files changed, 10 insertions(+), 56 deletions(-) delete mode 100644 .github/workflows/upstream-commit-check.yml create mode 100644 .github/workflows/validate-kernel-commits.yml diff --git a/.github/workflows/upstream-commit-check.yml b/.github/workflows/upstream-commit-check.yml deleted file mode 100644 index e95c4e904f8e4..0000000000000 --- a/.github/workflows/upstream-commit-check.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Check Kernel Commits for Upstream Fixes - -on: - pull_request: - types: [opened, synchronize, reopened] - -permissions: - contents: read - pull-requests: write - -jobs: - check-upstream-fixes: - runs-on: ubuntu-latest - - steps: - - name: Checkout PR branch - uses: actions/checkout@v4 - with: - repository: ${{ github.event.pull_request.head.repo.full_name }} - fetch-depth: 0 - ref: ${{ github.head_ref }} - - - name: Checkout base branch - run: | - git remote add base_repo https://github.com/${{ github.repository }}.git - git fetch base_repo ${{ github.base_ref }}:${{ github.base_ref }} - - - name: Download check_kernel_commits.py - run: | - curl -sL \ - https://raw.githubusercontent.com/ctrliq/kernel-src-tree-tools/mainline/check_kernel_commits.py \ - -o check_kernel_commits.py - chmod +x check_kernel_commits.py - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - - name: Run upstream fixes check - id: checkkernel - run: | - python3 check_kernel_commits.py --repo . --pr_branch "${{ github.head_ref }}" --base_branch "${{ github.base_ref }}" --markdown | tee result.txt - # Save non-empty results for PR comment - if grep -q -v "All referenced commits exist upstream and have no Fixes: tags." 
result.txt; then - echo "has_findings=true" >> $GITHUB_OUTPUT - fi - - - name: Comment on PR if issues found - if: steps.checkkernel.outputs.has_findings == 'true' - env: - GH_TOKEN: ${{ github.token }} - run: | - gh pr comment ${{ github.event.pull_request.number }} \ - --body "$(cat result.txt)" \ - --repo ${{ github.repository }} diff --git a/.github/workflows/validate-kernel-commits.yml b/.github/workflows/validate-kernel-commits.yml new file mode 100644 index 0000000000000..c74434336e251 --- /dev/null +++ b/.github/workflows/validate-kernel-commits.yml @@ -0,0 +1,10 @@ +name: Validate Kernel Commits + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + check: + uses: ctrliq/kernel-src-tree/.github/workflows/validate-kernel-commits.yml@main + secrets: inherit From 8002b59569af6f3190fed84072c360c25fbd4908 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Thu, 12 Mar 2026 14:46:21 +0100 Subject: [PATCH 19/50] github actions: Add kernelCI for rlc-8 Signed-off-by: Roxana Nicolescu --- .container_build_image | 1 + .github/workflows/kernel-build-and-test-multiarch.yml | 11 +++++++++++ 2 files changed, 12 insertions(+) create mode 100644 .container_build_image create mode 100644 .github/workflows/kernel-build-and-test-multiarch.yml diff --git a/.container_build_image b/.container_build_image new file mode 100644 index 0000000000000..e8712bc3cb180 --- /dev/null +++ b/.container_build_image @@ -0,0 +1 @@ +rocky-8-kernel-builder diff --git a/.github/workflows/kernel-build-and-test-multiarch.yml b/.github/workflows/kernel-build-and-test-multiarch.yml new file mode 100644 index 0000000000000..f5177d72d4948 --- /dev/null +++ b/.github/workflows/kernel-build-and-test-multiarch.yml @@ -0,0 +1,11 @@ +name: Automated kernel build and test x86_64 and aarch64 + +on: + push: + branches: + - '*_rlc-8/**' + +jobs: + kernelCI: + uses: ctrliq/kernel-src-tree/.github/workflows/kernel-build-and-test-multiarch.yml@main + secrets: inherit From 92d8f259c0c4139134003fbfc9a6a1d7124727f0 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Wed, 7 Aug 2024 12:04:26 -0400 Subject: [PATCH 20/50] net/mlx5e: Fix features validation check for tunneled UDP (non-VXLAN) packets jira LE-1733 bugfix geneve_fixes commit 791b4089e326271424b78f2fae778b20e53d071b Move the vxlan_features_check() call to after we verified the packet is a tunneled VXLAN packet. Without this, tunneled UDP non-VXLAN packets (for ex. GENENVE) might wrongly not get offloaded. In some cases, it worked by chance as GENEVE header is the same size as VXLAN, but it is obviously incorrect. Fixes: e3cfc7e6b7bd ("net/mlx5e: TX, Add geneve tunnel stateless offload support") Signed-off-by: Gal Pressman Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Signed-off-by: David S. 
Miller (cherry picked from commit 791b4089e326271424b78f2fae778b20e53d071b) Signed-off-by: Jonathan Maple --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 799726f4ec47f..bd8a364c9e21c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4424,7 +4424,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, /* Verify if UDP port is being offloaded by HW */ if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port)) - return features; + return vxlan_features_check(skb, features); #if IS_ENABLED(CONFIG_GENEVE) /* Support Geneve offload for default UDP port */ @@ -4450,7 +4450,6 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb, struct mlx5e_priv *priv = netdev_priv(netdev); features = vlan_features_check(skb, features); - features = vxlan_features_check(skb, features); /* Validate if the tunneled packet is being offloaded by HW */ if (skb->encapsulation && From c9b84bb6b4988e484eaac384f98b40f874b22b6b Mon Sep 17 00:00:00 2001 From: David Gomez Date: Tue, 18 Feb 2025 14:51:06 -0800 Subject: [PATCH 21/50] tipc: Fix use-after-free of kernel socket in cleanup_bearer(). jira VULN-12931 cve CVE-2024-56642 commit-author Kuniyuki Iwashima commit 6a2fa13312e51a621f652d522d7e2df7066330b6 syzkaller reported a use-after-free of UDP kernel socket in cleanup_bearer() without repro. [0][1] When bearer_disable() calls tipc_udp_disable(), cleanup of the UDP kernel socket is deferred by work calling cleanup_bearer(). tipc_net_stop() waits for such works to finish by checking tipc_net(net)->wq_count. However, the work decrements the count too early before releasing the kernel socket, unblocking cleanup_net() and resulting in use-after-free. Let's move the decrement after releasing the socket in cleanup_bearer(). 
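As an illustrative sketch (not the literal hunk, which follows below), the corrected teardown ordering is:

    static void cleanup_bearer(struct work_struct *work)
    {
            struct udp_bearer *ub = container_of(work, struct udp_bearer, work);

            /* Tear down everything that still dereferences the netns... */
            dst_cache_destroy(&ub->rcast.dst_cache);
            udp_tunnel_sock_release(ub->ubsock);
            synchronize_net();

            /* ...and only then unblock tipc_net_stop()/cleanup_net(). */
            atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
            kfree(ub);
    }

Note the remaining ub->ubsock->sk dereference after the socket has been released; a later patch in this series (the CVE-2024-56661 fix) caches the tipc_net pointer to close exactly that hole.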
[0]: ref_tracker: net notrefcnt@000000009b3d1faf has 1/1 users at sk_alloc+0x438/0x608 inet_create+0x4c8/0xcb0 __sock_create+0x350/0x6b8 sock_create_kern+0x58/0x78 udp_sock_create4+0x68/0x398 udp_sock_create+0x88/0xc8 tipc_udp_enable+0x5e8/0x848 __tipc_nl_bearer_enable+0x84c/0xed8 tipc_nl_bearer_enable+0x38/0x60 genl_family_rcv_msg_doit+0x170/0x248 genl_rcv_msg+0x400/0x5b0 netlink_rcv_skb+0x1dc/0x398 genl_rcv+0x44/0x68 netlink_unicast+0x678/0x8b0 netlink_sendmsg+0x5e4/0x898 ____sys_sendmsg+0x500/0x830 [1]: BUG: KMSAN: use-after-free in udp_hashslot include/net/udp.h:85 [inline] BUG: KMSAN: use-after-free in udp_lib_unhash+0x3b8/0x930 net/ipv4/udp.c:1979 udp_hashslot include/net/udp.h:85 [inline] udp_lib_unhash+0x3b8/0x930 net/ipv4/udp.c:1979 sk_common_release+0xaf/0x3f0 net/core/sock.c:3820 inet_release+0x1e0/0x260 net/ipv4/af_inet.c:437 inet6_release+0x6f/0xd0 net/ipv6/af_inet6.c:489 __sock_release net/socket.c:658 [inline] sock_release+0xa0/0x210 net/socket.c:686 cleanup_bearer+0x42d/0x4c0 net/tipc/udp_media.c:819 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xcaf/0x1c90 kernel/workqueue.c:3310 worker_thread+0xf6c/0x1510 kernel/workqueue.c:3391 kthread+0x531/0x6b0 kernel/kthread.c:389 ret_from_fork+0x60/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 Uninit was created at: slab_free_hook mm/slub.c:2269 [inline] slab_free mm/slub.c:4580 [inline] kmem_cache_free+0x207/0xc40 mm/slub.c:4682 net_free net/core/net_namespace.c:454 [inline] cleanup_net+0x16f2/0x19d0 net/core/net_namespace.c:647 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xcaf/0x1c90 kernel/workqueue.c:3310 worker_thread+0xf6c/0x1510 kernel/workqueue.c:3391 kthread+0x531/0x6b0 kernel/kthread.c:389 ret_from_fork+0x60/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 CPU: 0 UID: 0 PID: 54 Comm: kworker/0:2 Not tainted 6.12.0-rc1-00131-gf66ebf37d69c #7 91723d6f74857f70725e1583cba3cf4adc716cfa Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 Workqueue: events cleanup_bearer Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241127050512.28438-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni (cherry picked from commit 6a2fa13312e51a621f652d522d7e2df7066330b6) Signed-off-by: David Gomez --- net/tipc/udp_media.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 32009d7bd73bc..3ce73681c629a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -810,10 +810,10 @@ static void cleanup_bearer(struct work_struct *work) kfree_rcu(rcast, rcu); } - atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); synchronize_net(); + atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); kfree(ub); } From 661b91bb8f9d5f038b7fbf4cd6eb3179d69aed55 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 13 May 2025 13:41:41 +0000 Subject: [PATCH 22/50] nvme-tcp: fix potential memory corruption in nvme_tcp_recv_pdu() jira VULN-56026 cve CVE-2025-21927 commit-author Maurizio Lombardi commit ad95bab0cd28ed77c2c0d0b6e76e03e031391064 upstream-diff Removed `nvme_tcp_c2h_term' case from `nvme_tcp_recv_pdu_supported' for the sake of consistency of 
`nvme_tcp_recv_pdu''s behavior relative to the upstream version, between the cases of proper and improper header. (What could be considered as "`c2h_term' type support" started with 84e009042d0f3dfe91bec60bcd208ee3f866cbcd commit, not included in `ciqlts9_2''s history, so `nvme_tcp_recv_pdu_supported' in `ciqlts9_2' shouldn't report the `nvme_tcp_c2h_term' type as supported.) nvme_tcp_recv_pdu() doesn't check the validity of the header length. When header digests are enabled, a target might send a packet with an invalid header length (e.g. 255), causing nvme_tcp_verify_hdgst() to access memory outside the allocated area and cause memory corruptions by overwriting it with the calculated digest. Fix this by rejecting packets with an unexpected header length. Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch (cherry picked from commit ad95bab0cd28ed77c2c0d0b6e76e03e031391064) Signed-off-by: Brett Mastbergen --- drivers/nvme/host/tcp.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f41e96f0e847e..35de6f091014c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -152,6 +152,18 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type) +{ + switch (type) { + case nvme_tcp_c2h_data: + case nvme_tcp_r2t: + case nvme_tcp_rsp: + return true; + default: + return false; + } +} + static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) { u32 queue_idx = nvme_tcp_queue_id(queue); @@ -674,6 +686,16 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) { + if (!nvme_tcp_recv_pdu_supported(hdr->type)) + goto unsupported_pdu; + + dev_err(queue->ctrl->ctrl.device, + "pdu type %d has unexpected header length (%d)\n", + hdr->type, hdr->hlen); + return -EPROTO; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) @@ -697,10 +719,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_init_recv_ctx(queue); return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: - dev_err(queue->ctrl->ctrl.device, - "unsupported pdu type (%d)\n", hdr->type); - return -EINVAL; + goto unsupported_pdu; } + +unsupported_pdu: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) From 631f1ff2afaf0f97ec02928e2202fe454a2a7270 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Wed, 18 Jun 2025 16:02:39 +0000 Subject: [PATCH 23/50] netdevsim: Fix memory leak of nsim_dev->fa_cookie jira VULN-65790 cve CVE-2022-49803 commit-author Wang Yufen commit 064bc7312bd09a48798418663090be0c776183db kmemleak reports this issue: unreferenced object 0xffff8881bac872d0 (size 8): comm "sh", pid 58603, jiffies 4481524462 (age 68.065s) hex dump (first 8 bytes): 04 00 00 00 de ad be ef ........ 
backtrace: [<00000000c80b8577>] __kmalloc+0x49/0x150 [<000000005292b8c6>] nsim_dev_trap_fa_cookie_write+0xc1/0x210 [netdevsim] [<0000000093d78e77>] full_proxy_write+0xf3/0x180 [<000000005a662c16>] vfs_write+0x1c5/0xaf0 [<000000007aabf84a>] ksys_write+0xed/0x1c0 [<000000005f1d2e47>] do_syscall_64+0x3b/0x90 [<000000006001c6ec>] entry_SYSCALL_64_after_hwframe+0x63/0xcd The issue occurs in the following scenarios: nsim_dev_trap_fa_cookie_write() kmalloc() fa_cookie nsim_dev->fa_cookie = fa_cookie .. nsim_drv_remove() The fa_cookie allocked in nsim_dev_trap_fa_cookie_write() is not freed. To fix, add kfree(nsim_dev->fa_cookie) to nsim_drv_remove(). Fixes: d3cbb907ae57 ("netdevsim: add ACL trap reporting cookie as a metadata") Signed-off-by: Wang Yufen Cc: Jiri Pirko Link: https://lore.kernel.org/r/1668504625-14698-1-git-send-email-wangyufen@huawei.com Signed-off-by: Jakub Kicinski (cherry picked from commit 064bc7312bd09a48798418663090be0c776183db) Signed-off-by: Brett Mastbergen --- drivers/net/netdevsim/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 74ca64e0a2c8a..fa2ad8299d9b7 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -1647,6 +1647,7 @@ void nsim_drv_remove(struct nsim_bus_dev *nsim_bus_dev) ARRAY_SIZE(nsim_devlink_params)); devl_resources_unregister(devlink); kfree(nsim_dev->vfconfigs); + kfree(nsim_dev->fa_cookie); devl_unlock(devlink); devlink_free(devlink); dev_set_drvdata(&nsim_bus_dev->dev, NULL); From acfb5bb8a724ce4f01f511d2ebf2d8cc1d4e5425 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Fri, 19 Sep 2025 14:31:20 -0400 Subject: [PATCH 24/50] gso: fix udp gso fraglist segmentation after pull from frag_list jira VULN-45766 jira VULN-45767 cve cve-2024-49978 commit-author Willem de Bruijn commit a1e40ac5b5e9077fe1f7ae0eb88034db0f9ae1ab upstream-diff contextual diff is off due to massive reworks. In addition __udpv6_gso_segment_list_csum definition is not included. This was included via "net/gro.h" via 75082e7f4680 which is a bug fix to 4721031c3559 "net: move gro definitions to include/net/gro.h". Since we also do not have that we're just directly including net/ip6_checksum.h to this file. Detect gso fraglist skbs with corrupted geometry (see below) and pass these to skb_segment instead of skb_segment_list, as the first can segment them correctly. Valid SKB_GSO_FRAGLIST skbs - consist of two or more segments - the head_skb holds the protocol headers plus first gso_size - one or more frag_list skbs hold exactly one segment - all but the last must be gso_size Optional datapath hooks such as NAT and BPF (bpf_skb_pull_data) can modify these skbs, breaking these invariants. In extreme cases they pull all data into skb linear. For UDP, this causes a NULL ptr deref in __udpv4_gso_segment_list_csum at udp_hdr(seg->next)->dest. Detect invalid geometry due to pull, by checking head_skb size. Don't just drop, as this may blackhole a destination. Convert to be able to pass to regular skb_segment. 
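Restated as a predicate (illustrative only; uh is the UDP header of gso_skb, as in the hunk below), the geometry test is that the head skb holds the protocol headers plus exactly one gso_size of payload:

    /* Sketch: true iff the fraglist invariants still hold for the head skb. */
    static bool udp_fraglist_geometry_intact(const struct sk_buff *gso_skb)
    {
            const struct udphdr *uh = udp_hdr(gso_skb);

            return skb_pagelen(gso_skb) - sizeof(*uh) ==
                   skb_shinfo(gso_skb)->gso_size;
    }

Anything that fails this check was modified after GRO assembled it and must go through skb_segment instead of skb_segment_list.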
Link: https://lore.kernel.org/netdev/20240428142913.18666-1-shiming.cheng@mediatek.com/ Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Willem de Bruijn Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20241001171752.107580-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski (cherry picked from commit a1e40ac5b5e9077fe1f7ae0eb88034db0f9ae1ab) Signed-off-by: Jonathan Maple --- net/ipv4/udp_offload.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d5aee2190943e..f52f5d63464af 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -276,8 +277,26 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __sum16 check; __be16 newlen; - if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) - return __udp_gso_segment_list(gso_skb, features, is_ipv6); + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) { + /* Detect modified geometry and pass those to skb_segment. */ + if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) + return __udp_gso_segment_list(gso_skb, features, is_ipv6); + + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ + gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; + gso_skb->csum_offset = offsetof(struct udphdr, check); + gso_skb->ip_summed = CHECKSUM_PARTIAL; + + uh = udp_hdr(gso_skb); + if (is_ipv6) + uh->check = ~udp_v6_check(gso_skb->len, + &ipv6_hdr(gso_skb)->saddr, + &ipv6_hdr(gso_skb)->daddr, 0); + else + uh->check = ~udp_v4_check(gso_skb->len, + ip_hdr(gso_skb)->saddr, + ip_hdr(gso_skb)->daddr, 0); + } mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) From 82327648d47b65efc5cd5dc7ff0c09f7ad29dfcf Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Wed, 24 Sep 2025 16:11:22 -0400 Subject: [PATCH 25/50] bpf: Fix a segment issue when downgrading gso_size jira VULN-38750 jira VULN-38751 cve CVE-2024-42281 commit-author Fred Li commit fa5ef655615a01533035c6139248c5b33aa27028 Linearize the skb when downgrading gso_size because it may trigger a BUG_ON() later when the skb is segmented as described in [1,2]. Fixes: 2be7e212d5419 ("bpf: add bpf_skb_adjust_room helper") Signed-off-by: Fred Li Signed-off-by: Daniel Borkmann Reviewed-by: Willem de Bruijn Acked-by: Daniel Borkmann Link: https://lore.kernel.org/all/20240626065555.35460-2-dracodingfly@gmail.com [1] Link: https://lore.kernel.org/all/668d5cf1ec330_1c18c32947@willemb.c.googlers.com.notmuch [2] Link: https://lore.kernel.org/bpf/20240719024653.77006-1-dracodingfly@gmail.com (cherry picked from commit fa5ef655615a01533035c6139248c5b33aa27028) Signed-off-by: Jonathan Maple --- net/core/filter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 06c6ff7fb511b..2bffec483f26e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3497,13 +3497,20 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Due to header grow, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. 
+ * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. + */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } } return 0; From e7c0b7848f7907792dae3a475368240ee3cf8c3c Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Sep 2025 13:36:44 -0400 Subject: [PATCH 26/50] net: fix udp gso skb_segment after pull from frag_list jira VULN-156444 jira VULN-156445 cve CVE-2025-38124 commit-author Shiming Cheng commit 3382a1ed7f778db841063f5d7e317ac55f9e7f72 Commit a1e40ac5b5e9 ("net: gso: fix udp gso fraglist segmentation after pull from frag_list") detected invalid geometry in frag_list skbs and redirects them from skb_segment_list to more robust skb_segment. But some packets with modified geometry can also hit bugs in that code. We don't know how many such cases exist. Addressing each one by one also requires touching the complex skb_segment code, which risks introducing bugs for other types of skbs. Instead, linearize all these packets that fail the basic invariants on gso fraglist skbs. That is more robust. If only part of the fraglist payload is pulled into head_skb, it will always cause exception when splitting skbs by skb_segment. For detailed call stack information, see below. Valid SKB_GSO_FRAGLIST skbs - consist of two or more segments - the head_skb holds the protocol headers plus first gso_size - one or more frag_list skbs hold exactly one segment - all but the last must be gso_size Optional datapath hooks such as NAT and BPF (bpf_skb_pull_data) can modify fraglist skbs, breaking these invariants. In extreme cases they pull one part of data into skb linear. For UDP, this causes three payloads with lengths of (11,11,10) bytes were pulled tail to become (12,10,10) bytes. The skbs no longer meets the above SKB_GSO_FRAGLIST conditions because payload was pulled into head_skb, it needs to be linearized before pass to regular skb_segment. skb_segment+0xcd0/0xd14 __udp_gso_segment+0x334/0x5f4 udp4_ufo_fragment+0x118/0x15c inet_gso_segment+0x164/0x338 skb_mac_gso_segment+0xc4/0x13c __skb_gso_segment+0xc4/0x124 validate_xmit_skb+0x9c/0x2c0 validate_xmit_skb_list+0x4c/0x80 sch_direct_xmit+0x70/0x404 __dev_queue_xmit+0x64c/0xe5c neigh_resolve_output+0x178/0x1c4 ip_finish_output2+0x37c/0x47c __ip_finish_output+0x194/0x240 ip_finish_output+0x20/0xf4 ip_output+0x100/0x1a0 NF_HOOK+0xc4/0x16c ip_forward+0x314/0x32c ip_rcv+0x90/0x118 __netif_receive_skb+0x74/0x124 process_backlog+0xe8/0x1a4 __napi_poll+0x5c/0x1f8 net_rx_action+0x154/0x314 handle_softirqs+0x154/0x4b8 [118.376811] [C201134] rxq0_pus: [name:bug&]kernel BUG at net/core/skbuff.c:4278! 
[118.376829] [C201134] rxq0_pus: [name:traps&]Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP [118.470774] [C201134] rxq0_pus: [name:mrdump&]Kernel Offset: 0x178cc00000 from 0xffffffc008000000 [118.470810] [C201134] rxq0_pus: [name:mrdump&]PHYS_OFFSET: 0x40000000 [118.470827] [C201134] rxq0_pus: [name:mrdump&]pstate: 60400005 (nZCv daif +PAN -UAO) [118.470848] [C201134] rxq0_pus: [name:mrdump&]pc : [0xffffffd79598aefc] skb_segment+0xcd0/0xd14 [118.470900] [C201134] rxq0_pus: [name:mrdump&]lr : [0xffffffd79598a5e8] skb_segment+0x3bc/0xd14 [118.470928] [C201134] rxq0_pus: [name:mrdump&]sp : ffffffc008013770 Fixes: a1e40ac5b5e9 ("gso: fix udp gso fraglist segmentation after pull from frag_list") Signed-off-by: Shiming Cheng Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller (cherry picked from commit 3382a1ed7f778db841063f5d7e317ac55f9e7f72) Signed-off-by: Jonathan Maple --- net/ipv4/udp_offload.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f52f5d63464af..d12b9e82e8478 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -276,12 +276,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) { /* Detect modified geometry and pass those to skb_segment. */ if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); From 83fc55571619124910944f0d3be2c294babfc156 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 11 Oct 2023 04:01:10 +0000 Subject: [PATCH 27/50] x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach jira roc-2673 commit fbf6449f84bf5e4ad09f2c09ee70ed7d629b5ff6 Instead of setting x86_virt_bits to a possibly-correct value and then correcting it later, do all the necessary checks before setting it. At this point, the #VC handler references boot_cpu_data.x86_virt_bits, and in the previous version, it would be triggered by the CPUIDs between the point at which it is set to 48 and when it is set to the correct value. 
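Condensed, the resulting flow in get_cpu_address_sizes() is (a sketch of the hunk below, with the clflush handling trimmed):

    static void get_cpu_address_sizes(struct cpuinfo_x86 *c)
    {
            u32 eax, ebx, ecx, edx;

            if (cpu_has(c, X86_FEATURE_CPUID) &&
                c->extended_cpuid_level >= 0x80000008) {
                    cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
                    c->x86_virt_bits = (eax >> 8) & 0xff;
                    c->x86_phys_bits = eax & 0xff;
            } else if (IS_ENABLED(CONFIG_X86_64)) {
                    c->x86_virt_bits = 48;  /* conservative legacy defaults */
                    c->x86_phys_bits = 36;
            } else {
                    c->x86_virt_bits = 32;
                    c->x86_phys_bits = (cpu_has(c, X86_FEATURE_PAE) ||
                                        cpu_has(c, X86_FEATURE_PSE36)) ? 36 : 32;
            }
            c->x86_cache_bits = c->x86_phys_bits;
    }

The point is that x86_virt_bits is never published with a provisional value that the #VC handler could observe.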
Suggested-by: Dave Hansen Signed-off-by: Adam Dunlap Signed-off-by: Ingo Molnar Tested-by: Jacob Xu Link: https://lore.kernel.org/r/20230912002703.3924521-3-acdunlap@google.com Signed-off-by: Ronnie Sahlberg --- arch/x86/kernel/cpu/common.c | 37 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4130d43c6435a..b0ee6e921c2e8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1020,17 +1020,32 @@ void get_cpu_cap(struct cpuinfo_x86 *c) static void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + bool vp_bits_from_cpuid = true; - if (c->extended_cpuid_level >= 0x80000008) { + if (!cpu_has(c, X86_FEATURE_CPUID) || + (c->extended_cpuid_level < 0x80000008)) + vp_bits_from_cpuid = false; + + if (vp_bits_from_cpuid) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + } else { + if (IS_ENABLED(CONFIG_X86_64)) { + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; + } else { + c->x86_clflush_size = 32; + c->x86_virt_bits = 32; + c->x86_phys_bits = 32; + + if (cpu_has(c, X86_FEATURE_PAE) || + cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; + } } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif c->x86_cache_bits = c->x86_phys_bits; } @@ -1485,15 +1500,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; - c->x86_phys_bits = 36; - c->x86_virt_bits = 48; -#else - c->x86_clflush_size = 32; - c->x86_phys_bits = 32; - c->x86_virt_bits = 32; -#endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof(c->x86_capability)); @@ -1505,7 +1511,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_vendor(c); get_cpu_cap(c); get_model_name(c); /* RHEL8: get model name for unsupported check */ - get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); cpu_parse_early_param(); @@ -1522,6 +1527,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_CPUID); } + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); From a7f3126b189b68c3716a3cc8e7329d67cb5e1aa3 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 11 Oct 2023 04:04:08 +0000 Subject: [PATCH 28/50] x86/boot: Move x86_cache_alignment initialization to correct spot jira roc-2673 commit 3e32552652917f10c0aa8ac75cdc8f0b8d257dec c->x86_cache_alignment is initialized from c->x86_clflush_size. However, commit fbf6449f84bf moved c->x86_clflush_size initialization to later in boot without moving the c->x86_cache_alignment assignment: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") This presumably left c->x86_cache_alignment set to zero for longer than it should be. The result was an oops on 32-bit kernels while accessing a pointer at 0x20. The 0x20 came from accessing a structure member at offset 0x10 (buffer->cpumask) from a ZERO_SIZE_PTR=0x10. kmalloc() can evidently return ZERO_SIZE_PTR when it's given 0 as its alignment requirement. Move the c->x86_cache_alignment initialization to be after c->x86_clflush_size has an actual value. 
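Spelled out, the oops path is (hypothetical caller; offsets as in the report above):

    /* With c->x86_cache_alignment == 0, cache_line_size() returns 0.     */
    buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL);
    /* ALIGN(x, 0) evaluates to 0, so kzalloc(0) returns ZERO_SIZE_PTR    */
    /* (0x10), and buffer->cpumask at offset 0x10 then touches 0x20.      */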
Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20231002220045.1014760-1-dave.hansen@linux.intel.com (cherry picked from commit 3e32552652917f10c0aa8ac75cdc8f0b8d257dec) Signed-off-by: Ronnie Sahlberg --- arch/x86/kernel/cpu/common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b0ee6e921c2e8..cbb4f0bd8ec71 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1047,6 +1047,7 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c) } } c->x86_cache_bits = c->x86_phys_bits; + c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1500,8 +1501,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; From fed0dd9d31f0e9d44a87b96e35d7dd5952a57942 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 26 Dec 2024 14:34:31 -0500 Subject: [PATCH 29/50] x86/cpu: Allow reducing x86_phys_bits during early_identify_cpu() jira LE-2183 bug-fix x86/sev-es: Set x86_virt_bits commit-author Paolo Bonzini commit 9a458198eba98b7207669a166e64d04b04cb651b In commit fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach"), the initialization of c->x86_phys_bits was moved after this_cpu->c_early_init(c). This is incorrect because early_init_amd() expected to be able to reduce the value according to the contents of CPUID leaf 0x8000001f. Fortunately, the bug was negated by init_amd()'s call to early_init_amd(), which does reduce x86_phys_bits in the end. However, this is very late in the boot process and, most notably, the wrong value is used for x86_phys_bits when setting up MTRRs. To fix this, call get_cpu_address_sizes() as soon as X86_FEATURE_CPUID is set/cleared, and c->extended_cpuid_level is retrieved. 
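For context, the reduction early_init_amd() applies looks roughly like this (a hedged sketch of the upstream AMD code, not part of this hunk; CPUID 0x8000001f EBX[11:6] is the physical-address-bit reduction when memory encryption is enabled):

    if (c->extended_cpuid_level >= 0x8000001f)
            c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;

With get_cpu_address_sizes() running after c_early_init(), that reduction was clobbered by the raw CPUID 0x80000008 value and only restored much later by init_amd(), after MTRR setup had already consumed the wrong number.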
Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") Signed-off-by: Paolo Bonzini Signed-off-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20240131230902.1867092-2-pbonzini%40redhat.com (cherry picked from commit 9a458198eba98b7207669a166e64d04b04cb651b) Signed-off-by: Jonathan Maple --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cbb4f0bd8ec71..bdf0af9cf51d7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1511,6 +1511,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_model_name(c); /* RHEL8: get model name for unsupported check */ setup_force_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); cpu_parse_early_param(); if (this_cpu->c_early_init) @@ -1524,10 +1525,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } else { identify_cpu_without_cpuid(c); setup_clear_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); } - get_cpu_address_sizes(c); - setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); From ee37d864e3b44e1df442e69e60c6ee6f01f0d762 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 26 Dec 2024 14:44:56 -0500 Subject: [PATCH 30/50] x86/cpu: Get rid of an unnecessary local variable in get_cpu_address_sizes() jira LE-2183 bug-fix-prereq x86/sev-es: Set x86_virt_bits commit-author Borislav Petkov (AMD) commit 95bfb35269b2e85cff0dd2c957b2d42ebf95ae5f Drop 'vp_bits_from_cpuid' as it is not really needed. No functional changes. Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240316120706.4352-1-bp@alien8.de (cherry picked from commit 95bfb35269b2e85cff0dd2c957b2d42ebf95ae5f) Signed-off-by: Jonathan Maple --- arch/x86/kernel/cpu/common.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bdf0af9cf51d7..55f4839ea384b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1020,18 +1020,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) static void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - bool vp_bits_from_cpuid = true; if (!cpu_has(c, X86_FEATURE_CPUID) || - (c->extended_cpuid_level < 0x80000008)) - vp_bits_from_cpuid = false; - - if (vp_bits_from_cpuid) { - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } else { + (c->extended_cpuid_level < 0x80000008)) { if (IS_ENABLED(CONFIG_X86_64)) { c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1045,7 +1036,13 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_PSE36)) c->x86_phys_bits = 36; } + } else { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; } + c->x86_cache_bits = c->x86_phys_bits; c->x86_cache_alignment = c->x86_clflush_size; } From be96535f5b046c267851a6a23faa14dc39b7c31d Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 26 Dec 2024 14:45:08 -0500 Subject: [PATCH 31/50] x86/cpu: Provide default cache line size if not enumerated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira LE-2183 bug-fix x86/sev-es: Set x86_virt_bits commit-author Dave Hansen commit 
2a38e4ca302280fdcce370ba2bee79bac16c4587 tl;dr: CPUs with CPUID.80000008H but without CPUID.01H:EDX[CLFSH] will end up reporting cache_line_size()==0 and bad things happen. Fill in a default on those to avoid the problem. Long Story: The kernel dies a horrible death if c->x86_cache_alignment (aka. cache_line_size() is 0. Normally, this value is populated from c->x86_clflush_size. Right now the code is set up to get c->x86_clflush_size from two places. First, modern CPUs get it from CPUID. Old CPUs that don't have leaf 0x80000008 (or CPUID at all) just get some sane defaults from the kernel in get_cpu_address_sizes(). The vast majority of CPUs that have leaf 0x80000008 also get ->x86_clflush_size from CPUID. But there are oddballs. Intel Quark CPUs[1] and others[2] have leaf 0x80000008 but don't set CPUID.01H:EDX[CLFSH], so they skip over filling in ->x86_clflush_size: cpuid(0x00000001, &tfms, &misc, &junk, &cap0); if (cap0 & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; So they: land in get_cpu_address_sizes() and see that CPUID has level 0x80000008 and jump into the side of the if() that does not fill in c->x86_clflush_size. That assigns a 0 to c->x86_cache_alignment, and hilarity ensues in code like: buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); To fix this, always provide a sane value for ->x86_clflush_size. Big thanks to Andy Shevchenko for finding and reporting this and also providing a first pass at a fix. But his fix was only partial and only worked on the Quark CPUs. It would not, for instance, have worked on the QEMU config. 1. https://raw.githubusercontent.com/InstLatx64/InstLatx64/master/GenuineIntel/GenuineIntel0000590_Clanton_03_CPUID.txt 2. You can also get this behavior if you use "-cpu 486,+clzero" in QEMU. [ dhansen: remove 'vp_bits_from_cpuid' reference in changelog because bpetkov brutally murdered it recently. ] Fixes: fbf6449f84bf ("x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach") Reported-by: Andy Shevchenko Signed-off-by: Dave Hansen Tested-by: Andy Shevchenko Tested-by: JΓΆrn Heusipp Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240516173928.3960193-1-andriy.shevchenko@linux.intel.com/ Link: https://lore.kernel.org/lkml/5e31cad3-ad4d-493e-ab07-724cfbfaba44@heusipp.de/ Link: https://lore.kernel.org/all/20240517200534.8EC5F33E%40davehans-spike.ostc.intel.com (cherry picked from commit 2a38e4ca302280fdcce370ba2bee79bac16c4587) Signed-off-by: Jonathan Maple --- arch/x86/kernel/cpu/common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 55f4839ea384b..c4cb3f660b838 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1041,6 +1041,10 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } c->x86_cache_bits = c->x86_phys_bits; From 6c25335421a39c9752edcbec85c55713145620ee Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 19 Aug 2025 09:41:58 +0000 Subject: [PATCH 32/50] net: mana: Enable MANA driver on ARM64 with 4K page size jira LE-3812 commit-author Haiyang Zhang commit 40a1d11fc670ac03c5dc2e5a9724b330e74f38b0 Change the Kconfig dependency, so this driver can be built and run on ARM64 with 4K page size. 16/64K page sizes are not supported yet. 
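The 4K restriction exists because the device's DMA unit is a fixed 4KB regardless of the kernel's PAGE_SIZE; the next patch in this series makes that explicit by decoupling the two. A sketch of the macros it introduces:

    /* MANA_PAGE_SIZE is the DMA unit, independent of the CPU page size. */
    #define MANA_PAGE_SHIFT         12
    #define MANA_PAGE_SIZE          BIT(MANA_PAGE_SHIFT)
    #define MANA_PAGE_ALIGN(x)      ALIGN((x), MANA_PAGE_SIZE)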
Signed-off-by: Haiyang Zhang Link: https://lore.kernel.org/r/1715632141-8089-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit 40a1d11fc670ac03c5dc2e5a9724b330e74f38b0) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/Kconfig b/drivers/net/ethernet/microsoft/Kconfig index 090e6b9832431..61cb91c3a5dbd 100644 --- a/drivers/net/ethernet/microsoft/Kconfig +++ b/drivers/net/ethernet/microsoft/Kconfig @@ -17,7 +17,8 @@ if NET_VENDOR_MICROSOFT config MICROSOFT_MANA tristate "Microsoft Azure Network Adapter (MANA) support" - depends on PCI_MSI && X86_64 + depends on PCI_MSI + depends on X86_64 || (ARM64 && !CPU_BIG_ENDIAN && ARM64_4K_PAGES) depends on PCI_HYPERV select AUXILIARY_BUS help From cb9820020e0e4a6052d592ecd2287e0430e2e0ae Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 19 Aug 2025 09:42:06 +0000 Subject: [PATCH 33/50] net: mana: Add support for page sizes other than 4KB on ARM64 jira LE-3812 commit-author Haiyang Zhang commit 382d1741b5b2feffef7942dd074206372afe1a96 As defined by the MANA Hardware spec, the queue size for DMA is 4KB minimal, and power of 2. And, the HWC queue size has to be exactly 4KB. To support page sizes other than 4KB on ARM64, define the minimal queue size as a macro separately from the PAGE_SIZE, which we always assumed it to be 4KB before supporting ARM64. Also, add MANA specific macros and update code related to size alignment, DMA region calculations, etc. Signed-off-by: Haiyang Zhang Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1718655446-6576-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit 382d1741b5b2feffef7942dd074206372afe1a96) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/Kconfig | 2 +- drivers/net/ethernet/microsoft/mana/gdma_main.c | 10 +++++----- drivers/net/ethernet/microsoft/mana/hw_channel.c | 14 +++++++------- drivers/net/ethernet/microsoft/mana/mana_en.c | 8 ++++---- drivers/net/ethernet/microsoft/mana/shm_channel.c | 13 +++++++------ include/net/mana/gdma.h | 10 +++++++++- include/net/mana/mana.h | 3 ++- 7 files changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/microsoft/Kconfig b/drivers/net/ethernet/microsoft/Kconfig index 61cb91c3a5dbd..77bc47cbfbbf4 100644 --- a/drivers/net/ethernet/microsoft/Kconfig +++ b/drivers/net/ethernet/microsoft/Kconfig @@ -18,7 +18,7 @@ if NET_VENDOR_MICROSOFT config MICROSOFT_MANA tristate "Microsoft Azure Network Adapter (MANA) support" depends on PCI_MSI - depends on X86_64 || (ARM64 && !CPU_BIG_ENDIAN && ARM64_4K_PAGES) + depends on X86_64 || (ARM64 && !CPU_BIG_ENDIAN) depends on PCI_HYPERV select AUXILIARY_BUS help diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index f1b4b0b6ae65b..8d9e019bf1162 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -182,7 +182,7 @@ int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, dma_addr_t dma_handle; void *buf; - if (length < PAGE_SIZE || !is_power_of_2(length)) + if (length < MANA_PAGE_SIZE || !is_power_of_2(length)) return -EINVAL; gmi->dev = gc->dev; @@ -717,7 +717,7 @@ EXPORT_SYMBOL(mana_gd_destroy_dma_region); static int mana_gd_create_dma_region(struct gdma_dev *gd, struct gdma_mem_info *gmi) { - unsigned int num_page = gmi->length / 
PAGE_SIZE; + unsigned int num_page = gmi->length / MANA_PAGE_SIZE; struct gdma_create_dma_region_req *req = NULL; struct gdma_create_dma_region_resp resp = {}; struct gdma_context *gc = gd->gdma_context; @@ -727,10 +727,10 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, int err; int i; - if (length < PAGE_SIZE || !is_power_of_2(length)) + if (length < MANA_PAGE_SIZE || !is_power_of_2(length)) return -EINVAL; - if (offset_in_page(gmi->virt_addr) != 0) + if (!MANA_PAGE_ALIGNED(gmi->virt_addr)) return -EINVAL; hwc = gc->hwc.driver_data; @@ -751,7 +751,7 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, req->page_addr_list_len = num_page; for (i = 0; i < num_page; i++) - req->page_addr_list[i] = gmi->dma_handle + i * PAGE_SIZE; + req->page_addr_list[i] = gmi->dma_handle + i * MANA_PAGE_SIZE; err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp); if (err) diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 3a31ba66b821e..d2a339bc1cd25 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -361,12 +361,12 @@ static int mana_hwc_create_cq(struct hw_channel_context *hwc, u16 q_depth, int err; eq_size = roundup_pow_of_two(GDMA_EQE_SIZE * q_depth); - if (eq_size < MINIMUM_SUPPORTED_PAGE_SIZE) - eq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + if (eq_size < MANA_MIN_QSIZE) + eq_size = MANA_MIN_QSIZE; cq_size = roundup_pow_of_two(GDMA_CQE_SIZE * q_depth); - if (cq_size < MINIMUM_SUPPORTED_PAGE_SIZE) - cq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + if (cq_size < MANA_MIN_QSIZE) + cq_size = MANA_MIN_QSIZE; hwc_cq = kzalloc(sizeof(*hwc_cq), GFP_KERNEL); if (!hwc_cq) @@ -428,7 +428,7 @@ static int mana_hwc_alloc_dma_buf(struct hw_channel_context *hwc, u16 q_depth, dma_buf->num_reqs = q_depth; - buf_size = PAGE_ALIGN(q_depth * max_msg_size); + buf_size = MANA_PAGE_ALIGN(q_depth * max_msg_size); gmi = &dma_buf->mem_info; err = mana_gd_alloc_memory(gc, buf_size, gmi); @@ -496,8 +496,8 @@ static int mana_hwc_create_wq(struct hw_channel_context *hwc, else queue_size = roundup_pow_of_two(GDMA_MAX_SQE_SIZE * q_depth); - if (queue_size < MINIMUM_SUPPORTED_PAGE_SIZE) - queue_size = MINIMUM_SUPPORTED_PAGE_SIZE; + if (queue_size < MANA_MIN_QSIZE) + queue_size = MANA_MIN_QSIZE; hwc_wq = kzalloc(sizeof(*hwc_wq), GFP_KERNEL); if (!hwc_wq) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index e16317aadbca4..d80dd8baefdeb 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1867,10 +1867,10 @@ static int mana_create_txq(struct mana_port_context *apc, * to prevent overflow. 
*/ txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32; - BUILD_BUG_ON(!PAGE_ALIGNED(txq_size)); + BUILD_BUG_ON(!MANA_PAGE_ALIGNED(txq_size)); cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE; - cq_size = PAGE_ALIGN(cq_size); + cq_size = MANA_PAGE_ALIGN(cq_size); gc = gd->gdma_context; @@ -2128,8 +2128,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, if (err) goto out; - rq_size = PAGE_ALIGN(rq_size); - cq_size = PAGE_ALIGN(cq_size); + rq_size = MANA_PAGE_ALIGN(rq_size); + cq_size = MANA_PAGE_ALIGN(cq_size); /* Create RQ */ memset(&spec, 0, sizeof(spec)); diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.c b/drivers/net/ethernet/microsoft/mana/shm_channel.c index 5553af9c8085a..0f1679ebad96b 100644 --- a/drivers/net/ethernet/microsoft/mana/shm_channel.c +++ b/drivers/net/ethernet/microsoft/mana/shm_channel.c @@ -6,6 +6,7 @@ #include #include +#include #include #define PAGE_FRAME_L48_WIDTH_BYTES 6 @@ -155,8 +156,8 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, return err; } - if (!PAGE_ALIGNED(eq_addr) || !PAGE_ALIGNED(cq_addr) || - !PAGE_ALIGNED(rq_addr) || !PAGE_ALIGNED(sq_addr)) + if (!MANA_PAGE_ALIGNED(eq_addr) || !MANA_PAGE_ALIGNED(cq_addr) || + !MANA_PAGE_ALIGNED(rq_addr) || !MANA_PAGE_ALIGNED(sq_addr)) return -EINVAL; if ((eq_msix_index & VECTOR_MASK) != eq_msix_index) @@ -183,7 +184,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* EQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(eq_addr); + frame_addr = MANA_PFN(eq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); @@ -191,7 +192,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* CQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(cq_addr); + frame_addr = MANA_PFN(cq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); @@ -199,7 +200,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* RQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(rq_addr); + frame_addr = MANA_PFN(rq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); @@ -207,7 +208,7 @@ int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, /* SQ addr: low 48 bits of frame address */ shmem = (u64 *)ptr; - frame_addr = PHYS_PFN(sq_addr); + frame_addr = MANA_PFN(sq_addr); *shmem = frame_addr & PAGE_FRAME_L48_MASK; all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 27684135bb4d1..35507588a14d5 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -224,7 +224,15 @@ struct gdma_dev { struct auxiliary_device *adev; }; -#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE +/* MANA_PAGE_SIZE is the DMA unit */ +#define MANA_PAGE_SHIFT 12 +#define MANA_PAGE_SIZE BIT(MANA_PAGE_SHIFT) +#define MANA_PAGE_ALIGN(x) ALIGN((x), MANA_PAGE_SIZE) +#define MANA_PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), MANA_PAGE_SIZE) +#define MANA_PFN(a) ((a) >> MANA_PAGE_SHIFT) + +/* Required by HW */ +#define MANA_MIN_QSIZE MANA_PAGE_SIZE #define GDMA_CQE_SIZE 64 
#define GDMA_EQE_SIZE 16 diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 5da352adf1d68..6e4fee310e865 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -40,7 +40,8 @@ enum TRI_STATE { #define MAX_SEND_BUFFERS_PER_QUEUE 256 -#define EQ_SIZE (8 * PAGE_SIZE) +#define EQ_SIZE (8 * MANA_PAGE_SIZE) + #define LOG2_EQ_THROTTLE 3 #define MAX_PORTS_IN_MANA_DEV 256 From 92b758c0c8fbbca54114bae37702e8c2b6ff7fb0 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 19 Aug 2025 12:04:36 +0000 Subject: [PATCH 34/50] RDMA/mana_ib: Fix bug in creation of dma regions jira LE-3812 commit-author Konstantin Taranov commit e02497fb654689049ba8b46f098f17d5f19e0b3c Use ib_umem_dma_offset() helper to calculate correct dma offset. Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1709560361-26393-2-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky (cherry picked from commit e02497fb654689049ba8b46f098f17d5f19e0b3c) Signed-off-by: Shreeya Patel --- drivers/infiniband/hw/mana/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index faca092456fa3..7840c9e2631cc 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -358,7 +358,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, sizeof(struct gdma_create_dma_region_resp)); create_req->length = umem->length; - create_req->offset_in_page = umem->address & (page_sz - 1); + create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; create_req->page_count = num_pages_total; From 8b4a9bda4405e92c08596f54bd274fe7827cfba2 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 18 Aug 2025 21:04:33 +0000 Subject: [PATCH 35/50] RDMA/mana_ib: use the correct page size for mapping user-mode doorbell page jira LE-3812 commit-author Long Li commit 4a3b99bc04e501b816db78f70064e26a01257910 When mapping doorbell page from user-mode, the driver should use the system page size as this memory is allocated via mmap() from user-mode. 
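Together with the following patch, the rule of thumb is: anything mapped into or out of user space is sized by the CPU's MMU (PAGE_SIZE), while anything the device parses is sized by the 4KB hardware page (MANA_PAGE_SHIFT). A sketch combining the two one-line fixes:

    /* mmap()'d doorbell page: CPU MMU granularity. */
    ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL);

    /* DMA region page-table index: device granularity. */
    create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT;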
Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Long Li Link: https://patch.msgid.link/1725030993-16213-2-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky (cherry picked from commit 4a3b99bc04e501b816db78f70064e26a01257910) Signed-off-by: Shreeya Patel --- drivers/infiniband/hw/mana/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 7840c9e2631cc..a0affa92975cc 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -460,13 +460,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } From d050eccdb071a4e2b8eb2a5a12082b54f337e7b8 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 19 Aug 2025 12:04:57 +0000 Subject: [PATCH 36/50] RDMA/mana_ib: use the correct page table index based on hardware page size jira LE-3812 commit-author Long Li commit 9e517a8e9d9a303bf9bde35e5c5374795544c152 MANA hardware uses 4k page size. When calculating the page table index, it should use the hardware page size, not the system page size. Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Long Li Link: https://patch.msgid.link/1725030993-16213-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky (cherry picked from commit 9e517a8e9d9a303bf9bde35e5c5374795544c152) Signed-off-by: Shreeya Patel --- drivers/infiniband/hw/mana/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index a0affa92975cc..3132705aa192f 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -359,7 +359,7 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, create_req->length = umem->length; create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", From 4b5dd0fdcdb98b7352a5c85ed9fbc308b3f579a7 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 13 Nov 2025 11:58:44 -0500 Subject: [PATCH 37/50] tipc: fix NULL deref in cleanup_bearer() jira VULN-160088 cve CVE-2024-56661 commit-author Eric Dumazet commit b04d86fff66b15c07505d226431f808c15b1703c syzbot found [1] that after blamed commit, ub->ubsock->sk was NULL when attempting the atomic_dec() : atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); Fix this by caching the tipc_net pointer. 
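In sketch form (the full hunk follows below): everything derived from the socket must be captured before udp_tunnel_sock_release() drops it:

    struct tipc_net *tn = tipc_net(sock_net(ub->ubsock->sk)); /* still valid */

    udp_tunnel_sock_release(ub->ubsock); /* ub->ubsock->sk unusable after this */
    synchronize_net();
    atomic_dec(&tn->wq_count);           /* cached pointer, no NULL deref */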
[1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 0 UID: 0 PID: 5896 Comm: kworker/0:3 Not tainted 6.13.0-rc1-next-20241203-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: events cleanup_bearer RIP: 0010:read_pnet include/net/net_namespace.h:387 [inline] RIP: 0010:sock_net include/net/sock.h:655 [inline] RIP: 0010:cleanup_bearer+0x1f7/0x280 net/tipc/udp_media.c:820 Code: 18 48 89 d8 48 c1 e8 03 42 80 3c 28 00 74 08 48 89 df e8 3c f7 99 f6 48 8b 1b 48 83 c3 30 e8 f0 e4 60 00 48 89 d8 48 c1 e8 03 <42> 80 3c 28 00 74 08 48 89 df e8 1a f7 99 f6 49 83 c7 e8 48 8b 1b RSP: 0018:ffffc9000410fb70 EFLAGS: 00010206 RAX: 0000000000000006 RBX: 0000000000000030 RCX: ffff88802fe45a00 RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffffc9000410f900 RBP: ffff88807e1f0908 R08: ffffc9000410f907 R09: 1ffff92000821f20 R10: dffffc0000000000 R11: fffff52000821f21 R12: ffff888031d19980 R13: dffffc0000000000 R14: dffffc0000000000 R15: ffff88807e1f0918 FS: 0000000000000000(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000556ca050b000 CR3: 0000000031c0c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 6a2fa13312e5 ("tipc: Fix use-after-free of kernel socket in cleanup_bearer().") Reported-by: syzbot+46aa5474f179dacd1a3b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67508b5f.050a0220.17bd51.0070.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241204170548.4152658-1-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit b04d86fff66b15c07505d226431f808c15b1703c) Signed-off-by: Jonathan Maple --- net/tipc/udp_media.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 3ce73681c629a..4ffba186c0bea 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -803,6 +803,7 @@ static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); struct udp_replicast *rcast, *tmp; + struct tipc_net *tn; list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { dst_cache_destroy(&rcast->dst_cache); @@ -810,10 +811,14 @@ static void cleanup_bearer(struct work_struct *work) kfree_rcu(rcast, rcu); } + tn = tipc_net(sock_net(ub->ubsock->sk)); + dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); + + /* Note: could use a call_rcu() to avoid another synchronize_net() */ synchronize_net(); - atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); + atomic_dec(&tn->wq_count); kfree(ub); } From 4e51897b0761af30010039f956a69263dc5fb401 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 14:32:21 +0000 Subject: [PATCH 38/50] scsi: storvsc: Prefer returning channel with the same CPU as on the I/O issuing CPU jira LE-4535 commit-author Long Li commit b69ffeaa0ae43892683113b3f4ddf156398738b9 When selecting an outgoing channel for I/O, storvsc tries to select a channel with a returning CPU that is not the same as issuing CPU. This worked well in the past, however it doesn't work well when the Hyper-V exposes a large number of channels (up to the number of all CPUs). 
Using a different CPU for the returning channel is not efficient on Hyper-V. Change this behavior by preferring the channel with the same CPU as the current I/O issuing CPU whenever possible. Tests have shown improvements in newer Hyper-V/Azure environments, and no regression with older Hyper-V/Azure environments. Tested-by: Raheel Abdul Faizy Signed-off-by: Long Li Message-Id: <1759381530-7414-1-git-send-email-longli@linux.microsoft.com> Signed-off-by: Martin K. Petersen (cherry picked from commit b69ffeaa0ae43892683113b3f4ddf156398738b9) Signed-off-by: Shreeya Patel --- drivers/scsi/storvsc_drv.c | 96 ++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 51 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index cc824c6e19a7b..f66dde9adda1c 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1398,14 +1398,19 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, } /* - * Our channel array is sparsley populated and we + * Our channel array could be sparsley populated and we * initiated I/O on a processor/hw-q that does not * currently have a designated channel. Fix this. * The strategy is simple: - * I. Ensure NUMA locality - * II. Distribute evenly (best effort) + * I. Prefer the channel associated with the current CPU + * II. Ensure NUMA locality + * III. Distribute evenly (best effort) */ + /* Prefer the channel on the I/O issuing processor/hw-q */ + if (cpumask_test_cpu(q_num, &stor_device->alloced_cpus)) + return stor_device->stor_chns[q_num]; + node_mask = cpumask_of_node(cpu_to_node(q_num)); num_channels = 0; @@ -1461,59 +1466,48 @@ static int storvsc_do_io(struct hv_device *device, /* See storvsc_change_target_cpu(). */ outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); if (outgoing_channel != NULL) { - if (outgoing_channel->target_cpu == q_num) { - /* - * Ideally, we want to pick a different channel if - * available on the same NUMA node. - */ - node_mask = cpumask_of_node(cpu_to_node(q_num)); - for_each_cpu_wrap(tgt_cpu, - &stor_device->alloced_cpus, q_num + 1) { - if (!cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - if (tgt_cpu == q_num) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } - } + if (hv_get_avail_to_write_percent(&outgoing_channel->outbound) + > ring_avail_percent_lowater) + goto found_channel; - /* - * All the other channels on the same NUMA node are - * busy. Try to use the channel on the current CPU - */ - if (hv_get_avail_to_write_percent( - &outgoing_channel->outbound) - > ring_avail_percent_lowater) + /* + * Channel is busy, try to find a channel on the same NUMA node + */ + node_mask = cpumask_of_node(cpu_to_node(q_num)); + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + if (!cpumask_test_cpu(tgt_cpu, node_mask)) + continue; + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; + } + } - /* - * If we reach here, all the channels on the current - * NUMA node are busy.
Try to find a channel in - * other NUMA nodes - */ - for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { - if (cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } + /* + * If we reach here, all the channels on the current + * NUMA node are busy. Try to find a channel in + * all NUMA nodes + */ + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; } } + /* + * If we reach here, all the channels are busy. Use the + * original channel found. + */ } else { spin_lock_irqsave(&stor_device->lock, flags); outgoing_channel = stor_device->stor_chns[q_num]; From e2c7d55d9bc0c23c0bdede62500d78ebd66afc34 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 10 Feb 2026 13:05:38 -0500 Subject: [PATCH 39/50] PCI: Batch BAR sizing operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira SECO-458 commit-author Alex Williamson commit 4453f360862e5d9f0807941d613162c3f7a36559 Toggling memory enable is free on bare metal, but potentially expensive in virtualized environments as the device MMIO spaces are added and removed from the VM address space, including DMA mapping of those spaces through the IOMMU where peer-to-peer is supported. Currently memory decode is disabled around sizing each individual BAR, even for SR-IOV BARs while VF Enable is cleared. This can be better optimized for virtual environments by sizing a set of BARs at once, stashing the resulting mask into an array, while only toggling memory enable once. This also naturally improves the SR-IOV path as the caller becomes responsible for any necessary decode disables while sizing BARs, therefore SR-IOV BARs are sized relying only on the VF Enable rather than toggling the PF memory enable in the command register. 
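For readers new to BAR probing, a hedged sketch of what each stashed mask encodes (this mirrors the logic of the kernel's pci_size(), but is not code from this patch): sizing writes all-ones to the BAR, and the value read back has zeros in the address bits the device hardwires, so the lowest writable address bit is the decode size.

/*
 * Illustration only. For a 32-bit memory BAR mask as stashed by
 * __pci_size_stdbars(): bits [3:0] are flags, and the lowest set
 * address bit gives the BAR size (e.g. 0xfffff000 -> 0x1000).
 * Returns 0 for an unimplemented BAR (all address bits read as 0).
 */
static inline unsigned long long bar_size_from_mask(unsigned int sz)
{
	unsigned long long mask = sz & ~0xfULL;	/* strip flag bits [3:0] */

	return mask & ~(mask - 1);	/* isolate the lowest set bit */
}

Batching means this three-access sequence (read original, write ~0, read mask, restore) runs for every BAR inside a single decode-disable window, instead of toggling PCI_COMMAND around each BAR.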
Link: https://lore.kernel.org/r/20250120182202.1878581-1-alex.williamson@redhat.com Reported-by: Mitchell Augustin Link: https://lore.kernel.org/r/CAHTA-uYp07FgM6T1OZQKqAdSA5JrZo0ReNEyZgQZub4mDRrV5w@mail.gmail.com Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas Tested-by: Mitchell Augustin Reviewed-by: Mitchell Augustin Reviewed-by: Ilpo JΓ€rvinen (cherry picked from commit 4453f360862e5d9f0807941d613162c3f7a36559) Signed-off-by: Brett Mastbergen --- drivers/pci/iov.c | 8 +++- drivers/pci/pci.h | 4 +- drivers/pci/probe.c | 93 +++++++++++++++++++++++++++++++++------------ 3 files changed, 78 insertions(+), 27 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 14cac6a5598f3..5a26f8fc4319b 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -742,6 +742,7 @@ static int sriov_init(struct pci_dev *dev, int pos) struct pci_sriov *iov; struct resource *res; struct pci_dev *pdev; + u32 sriovbars[PCI_SRIOV_NUM_BARS]; pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl); if (ctrl & PCI_SRIOV_CTRL_VFE) { @@ -778,6 +779,10 @@ static int sriov_init(struct pci_dev *dev, int pos) if (!iov) return -ENOMEM; + /* Sizing SR-IOV BARs with VF Enable cleared - no decode */ + __pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS, + pos + PCI_SRIOV_BAR, sriovbars); + nres = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -789,7 +794,8 @@ static int sriov_init(struct pci_dev *dev, int pos) bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0; else bar64 = __pci_read_base(dev, pci_bar_unknown, res, - pos + PCI_SRIOV_BAR + i * 4); + pos + PCI_SRIOV_BAR + i * 4, + &sriovbars[i]); if (!res->flags) continue; if (resource_size(res) & (PAGE_SIZE - 1)) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ce956ebfda087..1421ea9e83a9a 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -213,8 +213,10 @@ bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); int pci_setup_device(struct pci_dev *dev); +void __pci_size_stdbars(struct pci_dev *dev, int count, + unsigned int pos, u32 *sizes); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, - struct resource *res, unsigned int reg); + struct resource *res, unsigned int reg, u32 *sizes); void pci_configure_ari(struct pci_dev *dev); void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 5b1b204fd3d43..c0af7d3d31bc7 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -170,40 +170,66 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar) #define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO) +/** + * __pci_size_bars - Read the raw BAR mask for a range of PCI BARs + * @dev: the PCI device + * @count: number of BARs to size + * @pos: starting config space position + * @sizes: array to store mask values + * @rom: indicate whether to use ROM mask, which avoids enabling ROM BARs + * + * Provided @sizes array must be sufficiently sized to store results for + * @count u32 BARs. Caller is responsible for disabling decode to specified + * BAR range around calling this function. This function is intended to avoid + * disabling decode around sizing each BAR individually, which can result in + * non-trivial overhead in virtualized environments with very large PCI BARs. 
+ */ +static void __pci_size_bars(struct pci_dev *dev, int count, + unsigned int pos, u32 *sizes, bool rom) +{ + u32 orig, mask = rom ? PCI_ROM_ADDRESS_MASK : ~0; + int i; + + for (i = 0; i < count; i++, pos += 4, sizes++) { + pci_read_config_dword(dev, pos, &orig); + pci_write_config_dword(dev, pos, mask); + pci_read_config_dword(dev, pos, sizes); + pci_write_config_dword(dev, pos, orig); + } +} + +void __pci_size_stdbars(struct pci_dev *dev, int count, + unsigned int pos, u32 *sizes) +{ + __pci_size_bars(dev, count, pos, sizes, false); +} + +static void __pci_size_rom(struct pci_dev *dev, unsigned int pos, u32 *sizes) +{ + __pci_size_bars(dev, 1, pos, sizes, true); +} + /** * __pci_read_base - Read a PCI BAR * @dev: the PCI device * @type: type of the BAR * @res: resource buffer to be filled in * @pos: BAR position in the config space + * @sizes: array of one or more pre-read BAR masks * * Returns 1 if the BAR is 64-bit, or 0 if 32-bit. */ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, - struct resource *res, unsigned int pos) + struct resource *res, unsigned int pos, u32 *sizes) { - u32 l = 0, sz = 0, mask; + u32 l = 0, sz; u64 l64, sz64, mask64; - u16 orig_cmd; struct pci_bus_region region, inverted_region; - mask = type ? PCI_ROM_ADDRESS_MASK : ~0; - - /* No printks while decoding is disabled! */ - if (!dev->mmio_always_on) { - pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); - if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) { - pci_write_config_word(dev, PCI_COMMAND, - orig_cmd & ~PCI_COMMAND_DECODE_ENABLE); - } - } - res->name = pci_name(dev); pci_read_config_dword(dev, pos, &l); - pci_write_config_dword(dev, pos, l | mask); - pci_read_config_dword(dev, pos, &sz); - pci_write_config_dword(dev, pos, l); + sz = sizes[0]; /* * All bits set in sz means the device isn't working properly. @@ -243,18 +269,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, if (res->flags & IORESOURCE_MEM_64) { pci_read_config_dword(dev, pos + 4, &l); - pci_write_config_dword(dev, pos + 4, ~0); - pci_read_config_dword(dev, pos + 4, &sz); - pci_write_config_dword(dev, pos + 4, l); + sz = sizes[1]; l64 |= ((u64)l << 32); sz64 |= ((u64)sz << 32); mask64 |= ((u64)~0 << 32); } - if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE)) - pci_write_config_word(dev, PCI_COMMAND, orig_cmd); - if (!sz64) goto fail; @@ -326,7 +347,11 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) { + u32 rombar, stdbars[PCI_STD_NUM_BARS]; unsigned int pos, reg; + u16 orig_cmd; + + BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS); if (dev->non_compliant_bars) return; @@ -335,10 +360,28 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) if (dev->is_virtfn) return; + /* No printks while decoding is disabled! 
*/ + if (!dev->mmio_always_on) { + pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); + if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) { + pci_write_config_word(dev, PCI_COMMAND, + orig_cmd & ~PCI_COMMAND_DECODE_ENABLE); + } + } + + __pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars); + if (rom) + __pci_size_rom(dev, rom, &rombar); + + if (!dev->mmio_always_on && + (orig_cmd & PCI_COMMAND_DECODE_ENABLE)) + pci_write_config_word(dev, PCI_COMMAND, orig_cmd); + for (pos = 0; pos < howmany; pos++) { struct resource *res = &dev->resource[pos]; reg = PCI_BASE_ADDRESS_0 + (pos << 2); - pos += __pci_read_base(dev, pci_bar_unknown, res, reg); + pos += __pci_read_base(dev, pci_bar_unknown, + res, reg, &stdbars[pos]); } if (rom) { @@ -346,7 +389,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) dev->rom_base_reg = rom; res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_READONLY | IORESOURCE_SIZEALIGN; - __pci_read_base(dev, pci_bar_mem32, res, rom); + __pci_read_base(dev, pci_bar_mem32, res, rom, &rombar); } } From 7735c1622bc83277240177529fb0ba6cbd78457d Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Wed, 25 Feb 2026 15:21:07 +0100 Subject: [PATCH 40/50] libeth: add Tx buffer completion helpers jira KERNEL-168 commit-author Alexander Lobakin commit 080d72f471c86f8906845bc822051f5790d0a90d Software-side Tx buffers for storing DMA, frame size, skb pointers etc. are pretty much generic and every driver defines them the same way. The same can be said for software Tx completions -- same napi_consume_skb()s and all that... Add a couple simple wrappers for doing that to stop repeating the old tale at least within the Intel code. Drivers are free to use 'priv' member at the end of the structure. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit 080d72f471c86f8906845bc822051f5790d0a90d) Signed-off-by: Roxana Nicolescu --- include/net/libeth/tx.h | 129 +++++++++++++++++++++++++++++++++++++ include/net/libeth/types.h | 25 +++++++ 2 files changed, 154 insertions(+) create mode 100644 include/net/libeth/tx.h create mode 100644 include/net/libeth/types.h diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h new file mode 100644 index 0000000000000..35614f9523f60 --- /dev/null +++ b/include/net/libeth/tx.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TX_H +#define __LIBETH_TX_H + +#include + +#include + +/* Tx buffer completion */ + +/** + * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion + * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required + * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() + * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA + * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + */ +enum libeth_sqe_type { + LIBETH_SQE_EMPTY = 0U, + LIBETH_SQE_CTX, + LIBETH_SQE_SLAB, + LIBETH_SQE_FRAG, + LIBETH_SQE_SKB, +}; + +/** + * struct libeth_sqe - represents a Send Queue Element / Tx buffer + * @type: type of the buffer, see the enum above + * @rs_idx: index of the last buffer from the batch this one was sent in + * @raw: slab buffer to free via kfree() + * @skb: &sk_buff to consume + * @dma: DMA address to unmap + * @len: length of the mapped region to unmap + * @nr_frags: number of frags in the frame this buffer belongs to + * @packets: number of physical packets 
sent for this frame + * @bytes: number of physical bytes sent for this frame + * @priv: driver-private scratchpad + */ +struct libeth_sqe { + enum libeth_sqe_type type:32; + u32 rs_idx; + + union { + void *raw; + struct sk_buff *skb; + }; + + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + + u32 nr_frags; + u32 packets; + u32 bytes; + + unsigned long priv; +} __aligned_largest; + +/** + * LIBETH_SQE_CHECK_PRIV - check the driver's private SQE data + * @p: type or name of the object the driver wants to fit into &libeth_sqe + * + * Make sure the driver's private data fits into libeth_sqe::priv. To be used + * right after its declaration. + */ +#define LIBETH_SQE_CHECK_PRIV(p) \ + static_assert(sizeof(p) <= sizeof_field(struct libeth_sqe, priv)) + +/** + * struct libeth_cq_pp - completion queue poll params + * @dev: &device to perform DMA unmapping + * @ss: onstack NAPI stats to fill + * @napi: whether it's called from the NAPI context + * + * libeth uses this structure to access objects needed for performing full + * Tx complete operation without passing lots of arguments and change the + * prototypes each time a new one is added. + */ +struct libeth_cq_pp { + struct device *dev; + struct libeth_sq_napi_stats *ss; + + bool napi; +}; + +/** + * libeth_tx_complete - perform Tx completion for one SQE + * @sqe: SQE to complete + * @cp: poll params + * + * Do Tx complete for all the types of buffers, incl. freeing, unmapping, + * updating the stats etc. + */ +static inline void libeth_tx_complete(struct libeth_sqe *sqe, + const struct libeth_cq_pp *cp) +{ + switch (sqe->type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_SKB: + case LIBETH_SQE_FRAG: + case LIBETH_SQE_SLAB: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (sqe->type) { + case LIBETH_SQE_SKB: + cp->ss->packets += sqe->packets; + cp->ss->bytes += sqe->bytes; + + napi_consume_skb(sqe->skb, cp->napi); + break; + case LIBETH_SQE_SLAB: + kfree(sqe->raw); + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +#endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h new file mode 100644 index 0000000000000..603825e451339 --- /dev/null +++ b/include/net/libeth/types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TYPES_H +#define __LIBETH_TYPES_H + +#include + +/** + * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @raw: alias to access all the fields as an array + */ +struct libeth_sq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +#endif /* __LIBETH_TYPES_H */ From cf913f0e7caef73388966e7942faa924613ad012 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 4 Sep 2024 17:47:44 +0200 Subject: [PATCH 41/50] idpf: convert to libeth Tx buffer completion jira KERNEL-168 commit-author Alexander Lobakin commit d9028db618a63e4bbe63eb56c0b0db2b4cb924bc upstream-diff | adjusted context due to missing #include introduced in commit 1b1b26208515 ("idpf: reuse libeth's definitions of parsed ptype structures") part of "convert RX to libeth" patchset. &idpf_tx_buffer is almost identical to the previous generations, as well as the way it's handled. 
Moreover, relying on dma_unmap_addr() and !!buf->skb instead of explicit defining of buffer's type was never good. Use the newly added libeth helpers to do it properly and reduce the copy-paste around the Tx code. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit d9028db618a63e4bbe63eb56c0b0db2b4cb924bc) Signed-off-by: Roxana Nicolescu Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 83 +++---- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 206 +++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 50 +---- 3 files changed, 107 insertions(+), 232 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 3cf493a51b8d0..82fbe926c12bf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ +#include + #include "idpf.h" /** @@ -222,6 +224,7 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); dma_unmap_addr_set(tx_buf, dma, dma); + tx_buf->type = LIBETH_SQE_FRAG; /* align size to end of page */ max_data += -dma & (IDPF_TX_MAX_READ_REQ_SIZE - 1); @@ -243,6 +246,8 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, i = 0; } + tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; + dma += max_data; size -= max_data; @@ -280,13 +285,13 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets, size, td_tag); - IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i); + first->type = LIBETH_SQE_SKB; + first->rs_idx = i; - /* set next_to_watch value indicating a packet is present */ - first->next_to_watch = tx_desc; + IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i); nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - netdev_tx_sent_queue(nq, first->bytecount); + netdev_tx_sent_queue(nq, first->bytes); idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more()); } @@ -304,8 +309,7 @@ idpf_tx_singleq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_base_tx_ctx_desc *ctx_desc; int ntu = txq->next_to_use; - memset(&txq->tx_buf[ntu], 0, sizeof(struct idpf_tx_buf)); - txq->tx_buf[ntu].ctx_entry = true; + txq->tx_buf[ntu].type = LIBETH_SQE_CTX; ctx_desc = &txq->base_ctx[ntu]; @@ -399,11 +403,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, first->skb = skb; if (tso) { - first->gso_segs = offload.tso_segs; - first->bytecount = skb->len + ((first->gso_segs - 1) * offload.tso_hdr_len); + first->packets = offload.tso_segs; + first->bytes = skb->len + ((first->packets - 1) * offload.tso_hdr_len); } else { - first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); - first->gso_segs = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + first->packets = 1; } idpf_tx_singleq_map(tx_q, first, &offload); @@ -423,10 +427,15 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, int *cleaned) { - unsigned int total_bytes = 0, total_pkts = 0; + struct libeth_sq_napi_stats ss = { }; struct idpf_base_tx_desc *tx_desc; u32 budget = tx_q->clean_budget; s16 ntc = tx_q->next_to_clean; + struct libeth_cq_pp cp = { + .dev = tx_q->dev, + .ss = &ss, + .napi = napi_budget, + }; struct idpf_netdev_priv *np; struct idpf_tx_buf *tx_buf; struct netdev_queue *nq; @@ -444,47 
+453,23 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, * such. We can skip this descriptor since there is no buffer * to clean. */ - if (tx_buf->ctx_entry) { - /* Clear this flag here to avoid stale flag values when - * this buffer is used for actual data in the future. - * There are cases where the tx_buf struct / the flags - * field will not be cleared before being reused. - */ - tx_buf->ctx_entry = false; + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) { + tx_buf->type = LIBETH_SQE_EMPTY; goto fetch_next_txq_desc; } - /* if next_to_watch is not set then no work pending */ - eop_desc = (struct idpf_base_tx_desc *)tx_buf->next_to_watch; - if (!eop_desc) - break; - - /* prevent any other reads prior to eop_desc */ + /* prevent any other reads prior to type */ smp_rmb(); + eop_desc = &tx_q->base_tx[tx_buf->rs_idx]; + /* if the descriptor isn't done, no work yet to do */ if (!(eop_desc->qw1 & cpu_to_le64(IDPF_TX_DESC_DTYPE_DESC_DONE))) break; - /* clear next_to_watch to prevent false hangs */ - tx_buf->next_to_watch = NULL; - /* update the statistics for this packet */ - total_bytes += tx_buf->bytecount; - total_pkts += tx_buf->gso_segs; - - napi_consume_skb(tx_buf->skb, napi_budget); - - /* unmap skb header data */ - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - - /* clear tx_buf data */ - tx_buf->skb = NULL; - dma_unmap_len_set(tx_buf, len, 0); + libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ while (tx_desc != eop_desc) { @@ -498,13 +483,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, } /* unmap any remaining paged data */ - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } + libeth_tx_complete(tx_buf, &cp); } /* update budget only if we did something */ @@ -524,11 +503,11 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, ntc += tx_q->desc_count; tx_q->next_to_clean = ntc; - *cleaned += total_pkts; + *cleaned += ss.packets; u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_add(&tx_q->q_stats.packets, total_pkts); - u64_stats_add(&tx_q->q_stats.bytes, total_bytes); + u64_stats_add(&tx_q->q_stats.packets, ss.packets); + u64_stats_add(&tx_q->q_stats.bytes, ss.bytes); u64_stats_update_end(&tx_q->stats_sync); np = netdev_priv(tx_q->netdev); @@ -536,7 +515,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, dont_wake = np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); - __netif_txq_completed_wake(nq, total_pkts, total_bytes, + __netif_txq_completed_wake(nq, ss.packets, ss.bytes, IDPF_DESC_UNUSED(tx_q), IDPF_TX_WAKE_THRESH, dont_wake); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index e163e54d1c31e..65cb09beb35e8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1,9 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ +#include + #include "idpf.h" #include "idpf_virtchnl.h" +struct idpf_tx_stash { + struct hlist_node hlist; + struct libeth_sqe buf; +}; + +#define idpf_tx_buf_compl_tag(buf) (*(int *)&(buf)->priv) +LIBETH_SQE_CHECK_PRIV(int); + static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); @@ -58,41 +68,18 @@ void 
idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue) } } -/** - * idpf_tx_buf_rel - Release a Tx buffer - * @tx_q: the queue that owns the buffer - * @tx_buf: the buffer to free - */ -static void idpf_tx_buf_rel(struct idpf_tx_queue *tx_q, - struct idpf_tx_buf *tx_buf) -{ - if (tx_buf->skb) { - if (dma_unmap_len(tx_buf, len)) - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_buf->skb); - } else if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - } - - tx_buf->next_to_watch = NULL; - tx_buf->skb = NULL; - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - dma_unmap_len_set(tx_buf, len, 0); -} - /** * idpf_tx_buf_rel_all - Free any empty Tx buffers * @txq: queue to be cleaned */ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { + struct libeth_sq_napi_stats ss = { }; struct idpf_buf_lifo *buf_stack; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; u16 i; /* Buffers already cleared, nothing to do */ @@ -101,7 +88,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) /* Free all the Tx buffer sk_buffs */ for (i = 0; i < txq->desc_count; i++) - idpf_tx_buf_rel(txq, &txq->tx_buf[i]); + libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); txq->tx_buf = NULL; @@ -201,10 +188,6 @@ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) if (!tx_q->tx_buf) return -ENOMEM; - /* Initialize tx_bufs with invalid completion tags */ - for (i = 0; i < tx_q->desc_count; i++) - tx_q->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) return 0; @@ -1651,37 +1634,6 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q) wake_up(&vport->sw_marker_wq); } -/** - * idpf_tx_splitq_clean_hdr - Clean TX buffer resources for header portion of - * packet - * @tx_q: tx queue to clean buffer from - * @tx_buf: buffer to be cleaned - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @napi_budget: Used to determine if we are in netpoll - */ -static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q, - struct idpf_tx_buf *tx_buf, - struct idpf_cleaned_stats *cleaned, - int napi_budget) -{ - napi_consume_skb(tx_buf->skb, napi_budget); - - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - - dma_unmap_len_set(tx_buf, len, 0); - } - - /* clear tx_buf data */ - tx_buf->skb = NULL; - - cleaned->bytes += tx_buf->bytecount; - cleaned->packets += tx_buf->gso_segs; -} - /** * idpf_tx_clean_stashed_bufs - clean bufs that were stored for * out of order completions @@ -1692,28 +1644,25 @@ static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q, */ static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, u16 compl_tag, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { struct idpf_tx_stash *stash; struct hlist_node *tmp_buf; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = cleaned, + .napi = budget, + }; /* Buffer completion */ hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, hlist, compl_tag) { - if (unlikely(stash->buf.compl_tag != (int)compl_tag)) + if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != + (int)compl_tag)) continue; - if (stash->buf.skb) { - idpf_tx_splitq_clean_hdr(txq, &stash->buf, cleaned, - budget); - } else if 
(dma_unmap_len(&stash->buf, len)) { - dma_unmap_page(txq->dev, - dma_unmap_addr(&stash->buf, dma), - dma_unmap_len(&stash->buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(&stash->buf, len, 0); - } + libeth_tx_complete(&stash->buf, &cp); /* Push shadow buf back onto stack */ idpf_buf_lifo_push(&txq->stash->buf_stack, stash); @@ -1733,8 +1682,7 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, { struct idpf_tx_stash *stash; - if (unlikely(!dma_unmap_addr(tx_buf, dma) && - !dma_unmap_len(tx_buf, len))) + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) return 0; stash = idpf_buf_lifo_pop(&txq->stash->buf_stack); @@ -1747,20 +1695,18 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, /* Store buffer params in shadow buffer */ stash->buf.skb = tx_buf->skb; - stash->buf.bytecount = tx_buf->bytecount; - stash->buf.gso_segs = tx_buf->gso_segs; + stash->buf.bytes = tx_buf->bytes; + stash->buf.packets = tx_buf->packets; + stash->buf.type = tx_buf->type; dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); - stash->buf.compl_tag = tx_buf->compl_tag; + idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); /* Add buffer to buf_hash table to be freed later */ hash_add(txq->stash->sched_buf_hash, &stash->hlist, - stash->buf.compl_tag); - - memset(tx_buf, 0, sizeof(struct idpf_tx_buf)); + idpf_tx_buf_compl_tag(&stash->buf)); - /* Reinitialize buf_id portion of tag */ - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + tx_buf->type = LIBETH_SQE_EMPTY; return 0; } @@ -1796,12 +1742,17 @@ do { \ */ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, bool descs_only) { union idpf_tx_flex_desc *next_pending_desc = NULL; union idpf_tx_flex_desc *tx_desc; s16 ntc = tx_q->next_to_clean; + struct libeth_cq_pp cp = { + .dev = tx_q->dev, + .ss = cleaned, + .napi = napi_budget, + }; struct idpf_tx_buf *tx_buf; tx_desc = &tx_q->flex_tx[ntc]; @@ -1817,13 +1768,10 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, * invalid completion tag since no buffer was used. We can * skip this descriptor since there is no buffer to clean. */ - if (unlikely(tx_buf->compl_tag == IDPF_SPLITQ_TX_INVAL_COMPL_TAG)) + if (tx_buf->type <= LIBETH_SQE_CTX) goto fetch_next_txq_desc; - eop_desc = (union idpf_tx_flex_desc *)tx_buf->next_to_watch; - - /* clear next_to_watch to prevent false hangs */ - tx_buf->next_to_watch = NULL; + eop_desc = &tx_q->flex_tx[tx_buf->rs_idx]; if (descs_only) { if (idpf_stash_flow_sch_buffers(tx_q, tx_buf)) @@ -1840,8 +1788,7 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, } } } else { - idpf_tx_splitq_clean_hdr(tx_q, tx_buf, cleaned, - napi_budget); + libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ while (tx_desc != eop_desc) { @@ -1849,13 +1796,7 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, tx_desc, tx_buf); /* unmap any remaining paged data */ - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } + libeth_tx_complete(tx_buf, &cp); } } @@ -1891,30 +1832,26 @@ do { \ * this completion tag. 
*/ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; u16 ntc = txq->next_to_clean; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = cleaned, + .napi = budget, + }; u16 num_descs_cleaned = 0; u16 orig_idx = idx; tx_buf = &txq->tx_buf[idx]; + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) + return false; - while (tx_buf->compl_tag == (int)compl_tag) { - if (tx_buf->skb) { - idpf_tx_splitq_clean_hdr(txq, tx_buf, cleaned, budget); - } else if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(txq->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } - - memset(tx_buf, 0, sizeof(struct idpf_tx_buf)); - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + while (idpf_tx_buf_compl_tag(tx_buf) == (int)compl_tag) { + libeth_tx_complete(tx_buf, &cp); num_descs_cleaned++; idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); @@ -1961,7 +1898,7 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, */ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, struct idpf_splitq_tx_compl_desc *desc, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { u16 compl_tag; @@ -2004,7 +1941,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget, ntc -= complq->desc_count; do { - struct idpf_cleaned_stats cleaned_stats = { }; + struct libeth_sq_napi_stats cleaned_stats = { }; struct idpf_tx_queue *tx_q; int rel_tx_qid; u16 hw_head; @@ -2275,6 +2212,12 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, struct idpf_tx_buf *first, u16 idx) { + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + u64_stats_update_begin(&txq->stats_sync); u64_stats_inc(&txq->q_stats.dma_map_errs); u64_stats_update_end(&txq->stats_sync); @@ -2284,7 +2227,7 @@ void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, struct idpf_tx_buf *tx_buf; tx_buf = &txq->tx_buf[idx]; - idpf_tx_buf_rel(txq, tx_buf); + libeth_tx_complete(tx_buf, &cp); if (tx_buf == first) break; if (idx == 0) @@ -2373,7 +2316,8 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (dma_mapping_error(tx_q->dev, dma)) return idpf_tx_dma_map_error(tx_q, skb, first, i); - tx_buf->compl_tag = params->compl_tag; + idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; + tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); @@ -2447,8 +2391,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, * simply pass over these holes and finish cleaning the * rest of the packet. */ - memset(&tx_q->tx_buf[i], 0, sizeof(struct idpf_tx_buf)); - tx_q->tx_buf[i].compl_tag = params->compl_tag; + tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; /* Adjust the DMA offset and the remaining size of the * fragment. 
On the first iteration of this loop, @@ -2493,19 +2436,19 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, /* record SW timestamp if HW timestamp is not available */ skb_tx_timestamp(skb); + first->type = LIBETH_SQE_SKB; + /* write last descriptor with RS and EOP bits */ + first->rs_idx = i; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); - /* set next_to_watch value indicating a packet is present */ - first->next_to_watch = tx_desc; - tx_q->txq_grp->num_completions_pending++; /* record bytecount for BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - netdev_tx_sent_queue(nq, first->bytecount); + netdev_tx_sent_queue(nq, first->bytes); idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more()); } @@ -2705,8 +2648,7 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - memset(&txq->tx_buf[i], 0, sizeof(struct idpf_tx_buf)); - txq->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + txq->tx_buf[i].type = LIBETH_SQE_CTX; /* grab the next descriptor */ desc = &txq->flex_ctx[i]; @@ -2790,12 +2732,12 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, first->skb = skb; if (tso) { - first->gso_segs = tx_params.offload.tso_segs; - first->bytecount = skb->len + - ((first->gso_segs - 1) * tx_params.offload.tso_hdr_len); + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); } else { - first->gso_segs = 1; - first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index f119f240d21cd..9ead6a55f9cbc 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -137,7 +137,6 @@ do { \ (txq)->num_completions_pending - (txq)->complq->num_completions) #define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -#define IDPF_SPLITQ_TX_INVAL_COMPL_TAG -1 /* Adjust the generation for the completion tag and wrap if necessary */ #define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ @@ -155,47 +154,7 @@ union idpf_tx_flex_desc { struct idpf_flex_tx_sched_desc flow; /* flow based scheduling */ }; -/** - * struct idpf_tx_buf - * @next_to_watch: Next descriptor to clean - * @skb: Pointer to the skb - * @dma: DMA address - * @len: DMA length - * @bytecount: Number of bytes - * @gso_segs: Number of GSO segments - * @compl_tag: Splitq only, unique identifier for a buffer. Used to compare - * with completion tag returned in buffer completion event. - * Because the completion tag is expected to be the same in all - * data descriptors for a given packet, and a single packet can - * span multiple buffers, we need this field to track all - * buffers associated with this completion tag independently of - * the buf_id. The tag consists of a N bit buf_id and M upper - * order "generation bits". See compl_tag_bufid_m and - * compl_tag_gen_s in struct idpf_queue. We'll use a value of -1 - * to indicate the tag is not valid. - * @ctx_entry: Singleq only. Used to indicate the corresponding entry - * in the descriptor ring was used for a context descriptor and - * this buffer entry should be skipped. 
- */ -struct idpf_tx_buf { - void *next_to_watch; - struct sk_buff *skb; - DEFINE_DMA_UNMAP_ADDR(dma); - DEFINE_DMA_UNMAP_LEN(len); - unsigned int bytecount; - unsigned short gso_segs; - - union { - int compl_tag; - - bool ctx_entry; - }; -}; - -struct idpf_tx_stash { - struct hlist_node hlist; - struct idpf_tx_buf buf; -}; +#define idpf_tx_buf libeth_sqe /** * struct idpf_buf_lifo - LIFO for managing OOO completions @@ -589,11 +548,6 @@ struct idpf_tx_queue_stats { u64_stats_t dma_map_errs; }; -struct idpf_cleaned_stats { - u32 packets; - u32 bytes; -}; - #define IDPF_ITR_DYNAMIC 1 #define IDPF_ITR_MAX 0x1FE0 #define IDPF_ITR_20K 0x0032 @@ -771,7 +725,7 @@ struct idpf_tx_queue { void *desc_ring; }; - struct idpf_tx_buf *tx_buf; + struct libeth_sqe *tx_buf; struct idpf_txq_group *txq_grp; struct device *dev; void __iomem *tail; From c0eb109bf020f5c14a7e6eb358c0318456472bcf Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Wed, 25 Feb 2026 15:37:35 +0100 Subject: [PATCH 42/50] netdevice: add netdev_tx_reset_subqueue() shorthand jira KERNEL-168 commit-author Alexander Lobakin commit 3dc95a3edd0a86b4a59670b3fafcc64c7d83e2e7 Add a shorthand similar to other net*_subqueue() helpers for resetting the queue by its index w/o obtaining &netdev_tx_queue beforehand manually. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit 3dc95a3edd0a86b4a59670b3fafcc64c7d83e2e7) Signed-off-by: Roxana Nicolescu --- include/linux/netdevice.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cef072e7cfbbc..de45924a74c49 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3699,6 +3699,17 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) #endif } +/** + * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue + * @dev: network device + * @qid: stack index of the queue to reset + */ +static inline void netdev_tx_reset_subqueue(const struct net_device *dev, + u32 qid) +{ + netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); +} + /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device @@ -3708,7 +3719,7 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) */ static inline void netdev_reset_queue(struct net_device *dev_queue) { - netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); + netdev_tx_reset_subqueue(dev_queue, 0); } /** From be3defb4af43c48394d9509258bef2f85ebd5bad Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Wed, 4 Sep 2024 17:47:46 +0200 Subject: [PATCH 43/50] idpf: refactor Tx completion routines jira KERNEL-168 commit-author Joshua Hay commit 24eb35b15152ed6a2473019413b86b8f1c9714be upstream-diff | adjusted context in .h file around struct idpf_compl_queue due to missing commit 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") Add a mechanism to guard against stashing partial packets into the hash table to make the driver more robust, with more efficient decision making when cleaning. Don't stash partial packets. This can happen when an RE (Report Event) completion is received in flow scheduling mode, or when an out of order RS (Report Status) completion is received. The first buffer with the skb is stashed, but some or all of its frags are not because the stack is out of reserve buffers. This leaves the ring in a weird state since the frags are still on the ring. 
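The all-or-nothing guard described in the next paragraphs boils down to one check before any buffer of a packet is stashed (a condensed sketch; IDPF_TX_BUF_RSV_UNUSED() is the driver's reserve-stack headroom macro, and the helper name here is hypothetical):

/*
 * Condensed sketch of the guard added in idpf_tx_splitq_clean()
 * below; the function name is hypothetical.
 */
static bool can_stash_whole_packet(struct idpf_tx_queue *tx_q,
				   const struct idpf_tx_buf *first)
{
	/*
	 * Stash all fragments or none: a partial stash would strand
	 * frags on the ring and corrupt next_to_clean.
	 */
	return IDPF_TX_BUF_RSV_UNUSED(tx_q) >= first->nr_frags;
}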
Use the field libeth_sqe::nr_frags to track the number of fragments/tx_bufs representing the packet. The clean routines check to make sure there are enough reserve buffers on the stack before stashing any part of the packet. If there are not, next_to_clean is left pointing to the first buffer of the packet that failed to be stashed. This leaves the whole packet on the ring, and the next time around, cleaning will start from this packet. An RS completion is still expected for this packet in either case. So instead of being cleaned from the hash table, it will be cleaned from the ring directly. This should all still be fine since the DESC_UNUSED and BUFS_UNUSED will reflect the state of the ring. If we ever fall below the thresholds, the TxQ will still be stopped, giving the completion queue time to catch up. This may lead to stopping the queue more frequently, but it guarantees the Tx ring will always be in a good state. Also, always use the idpf_tx_splitq_clean function to clean descriptors, i.e. use it from clean_buf_ring as well. This way we avoid duplicating the logic and make sure we're using the same reserve buffers guard rail. This does require a switch from the s16 next_to_clean overflow descriptor ring wrap calculation to u16 and the normal ring size check. Signed-off-by: Joshua Hay Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit 24eb35b15152ed6a2473019413b86b8f1c9714be) Signed-off-by: Roxana Nicolescu Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 24 +-- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 168 +++++++++++------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 +- 3 files changed, 122 insertions(+), 76 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 82fbe926c12bf..762c8af258522 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -238,15 +238,16 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, offsets, max_data, td_tag); - tx_desc++; - i++; - - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = &tx_q->tx_buf[0]; tx_desc = &tx_q->base_tx[0]; i = 0; + } else { + tx_buf++; + tx_desc++; } - tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; + tx_buf->type = LIBETH_SQE_EMPTY; dma += max_data; size -= max_data; @@ -260,12 +261,14 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets, size, td_tag); - tx_desc++; - i++; - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = &tx_q->tx_buf[0]; tx_desc = &tx_q->base_tx[0]; i = 0; + } else { + tx_buf++; + tx_desc++; } size = skb_frag_size(frag); @@ -273,8 +276,6 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, dma = skb_frag_dma_map(tx_q->dev, frag, 0, size, DMA_TO_DEVICE); - - tx_buf = &tx_q->tx_buf[i]; } skb_tx_timestamp(first->skb); @@ -458,6 +459,9 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, goto fetch_next_txq_desc; } + if (unlikely(tx_buf->type != LIBETH_SQE_SKB)) + break; + /* prevent any other reads prior to type */ smp_rmb(); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 65cb09beb35e8..e5ea2e40d9215 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -11,8 +11,8 @@ 
struct idpf_tx_stash { struct libeth_sqe buf; }; -#define idpf_tx_buf_compl_tag(buf) (*(int *)&(buf)->priv) -LIBETH_SQE_CHECK_PRIV(int); +#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) +LIBETH_SQE_CHECK_PRIV(u32); static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); @@ -76,11 +76,13 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { struct libeth_sq_napi_stats ss = { }; struct idpf_buf_lifo *buf_stack; + struct idpf_tx_stash *stash; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - u16 i; + struct hlist_node *tmp; + u32 i, tag; /* Buffers already cleared, nothing to do */ if (!txq->tx_buf) @@ -100,6 +102,20 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) if (!buf_stack->bufs) return; + /* + * If a Tx timeout occurred, there are potentially still bufs in the + * hash table, free them here. + */ + hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash, + hlist) { + if (!stash) + continue; + + libeth_tx_complete(&stash->buf, &cp); + hash_del(&stash->hlist); + idpf_buf_lifo_push(buf_stack, stash); + } + for (i = 0; i < buf_stack->size; i++) kfree(buf_stack->bufs[i]); @@ -116,6 +132,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) { idpf_tx_buf_rel_all(txq); + netdev_tx_reset_subqueue(txq->netdev, txq->idx); if (!txq->desc_ring) return; @@ -1658,16 +1675,14 @@ static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, /* Buffer completion */ hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, hlist, compl_tag) { - if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != - (int)compl_tag)) + if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag)) continue; + hash_del(&stash->hlist); libeth_tx_complete(&stash->buf, &cp); /* Push shadow buf back onto stack */ idpf_buf_lifo_push(&txq->stash->buf_stack, stash); - - hash_del(&stash->hlist); } } @@ -1698,6 +1713,7 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, stash->buf.bytes = tx_buf->bytes; stash->buf.packets = tx_buf->packets; stash->buf.type = tx_buf->type; + stash->buf.nr_frags = tx_buf->nr_frags; dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); @@ -1713,9 +1729,8 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \ do { \ - (ntc)++; \ - if (unlikely(!(ntc))) { \ - ntc -= (txq)->desc_count; \ + if (unlikely(++(ntc) == (txq)->desc_count)) { \ + ntc = 0; \ buf = (txq)->tx_buf; \ desc = &(txq)->flex_tx[0]; \ } else { \ @@ -1739,59 +1754,66 @@ do { \ * Separate packet completion events will be reported on the completion queue, * and the buffers will be cleaned separately. The stats are not updated from * this function when using flow-based scheduling. + * + * Furthermore, in flow scheduling mode, check to make sure there are enough + * reserve buffers to stash the packet. If there are not, return early, which + * will leave next_to_clean pointing to the packet that failed to be stashed. + * + * Return: false in the scenario above, true otherwise. 
*/ -static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, +static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, struct libeth_sq_napi_stats *cleaned, bool descs_only) { union idpf_tx_flex_desc *next_pending_desc = NULL; union idpf_tx_flex_desc *tx_desc; - s16 ntc = tx_q->next_to_clean; + u32 ntc = tx_q->next_to_clean; struct libeth_cq_pp cp = { .dev = tx_q->dev, .ss = cleaned, .napi = napi_budget, }; struct idpf_tx_buf *tx_buf; + bool clean_complete = true; tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; tx_buf = &tx_q->tx_buf[ntc]; - ntc -= tx_q->desc_count; while (tx_desc != next_pending_desc) { - union idpf_tx_flex_desc *eop_desc; + u32 eop_idx; /* If this entry in the ring was used as a context descriptor, - * it's corresponding entry in the buffer ring will have an - * invalid completion tag since no buffer was used. We can - * skip this descriptor since there is no buffer to clean. + * it's corresponding entry in the buffer ring is reserved. We + * can skip this descriptor since there is no buffer to clean. */ if (tx_buf->type <= LIBETH_SQE_CTX) goto fetch_next_txq_desc; - eop_desc = &tx_q->flex_tx[tx_buf->rs_idx]; + if (unlikely(tx_buf->type != LIBETH_SQE_SKB)) + break; + + eop_idx = tx_buf->rs_idx; if (descs_only) { - if (idpf_stash_flow_sch_buffers(tx_q, tx_buf)) + if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) { + clean_complete = false; goto tx_splitq_clean_out; + } + + idpf_stash_flow_sch_buffers(tx_q, tx_buf); - while (tx_desc != eop_desc) { + while (ntc != eop_idx) { idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); - - if (dma_unmap_len(tx_buf, len)) { - if (idpf_stash_flow_sch_buffers(tx_q, - tx_buf)) - goto tx_splitq_clean_out; - } + idpf_stash_flow_sch_buffers(tx_q, tx_buf); } } else { libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ - while (tx_desc != eop_desc) { + while (ntc != eop_idx) { idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); @@ -1805,8 +1827,9 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, } tx_splitq_clean_out: - ntc += tx_q->desc_count; tx_q->next_to_clean = ntc; + + return clean_complete; } #define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \ @@ -1837,48 +1860,63 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, { u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; - u16 ntc = txq->next_to_clean; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = cleaned, .napi = budget, }; - u16 num_descs_cleaned = 0; - u16 orig_idx = idx; + u16 ntc, orig_idx = idx; tx_buf = &txq->tx_buf[idx]; - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) + + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX || + idpf_tx_buf_compl_tag(tx_buf) != compl_tag)) return false; - while (idpf_tx_buf_compl_tag(tx_buf) == (int)compl_tag) { + if (tx_buf->type == LIBETH_SQE_SKB) libeth_tx_complete(tx_buf, &cp); - num_descs_cleaned++; + idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + + while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) { + libeth_tx_complete(tx_buf, &cp); idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); } - /* If we didn't clean anything on the ring for this completion, there's - * nothing more to do. - */ - if (unlikely(!num_descs_cleaned)) - return false; - - /* Otherwise, if we did clean a packet on the ring directly, it's safe - * to assume that the descriptors starting from the original - * next_to_clean up until the previously cleaned packet can be reused. 
- * Therefore, we will go back in the ring and stash any buffers still - * in the ring into the hash table to be cleaned later. + /* + * It's possible the packet we just cleaned was an out of order + * completion, which means we can stash the buffers starting from + * the original next_to_clean and reuse the descriptors. We need + * to compare the descriptor ring next_to_clean packet's "first" buffer + * to the "first" buffer of the packet we just cleaned to determine if + * this is the case. Howevever, next_to_clean can point to either a + * reserved buffer that corresponds to a context descriptor used for the + * next_to_clean packet (TSO packet) or the "first" buffer (single + * packet). The orig_idx from the packet we just cleaned will always + * point to the "first" buffer. If next_to_clean points to a reserved + * buffer, let's bump ntc once and start the comparison from there. */ + ntc = txq->next_to_clean; tx_buf = &txq->tx_buf[ntc]; - while (tx_buf != &txq->tx_buf[orig_idx]) { - idpf_stash_flow_sch_buffers(txq, tx_buf); + + if (tx_buf->type == LIBETH_SQE_CTX) idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); - } - /* Finally, update next_to_clean to reflect the work that was just done - * on the ring, if any. If the packet was only cleaned from the hash - * table, the ring will not be impacted, therefore we should not touch - * next_to_clean. The updated idx is used here + /* + * If ntc still points to a different "first" buffer, clean the + * descriptor ring and stash all of the buffers for later cleaning. If + * we cannot stash all of the buffers, next_to_clean will point to the + * "first" buffer of the packet that could not be stashed and cleaning + * will start there next time. + */ + if (unlikely(tx_buf != &txq->tx_buf[orig_idx] && + !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned, + true))) + return true; + + /* + * Otherwise, update next_to_clean to reflect the cleaning that was + * done above. */ txq->next_to_clean = idx; @@ -1906,7 +1944,8 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, if (!idpf_queue_has(FLOW_SCH_EN, txq)) { u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head); - return idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + return; } compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); @@ -2306,6 +2345,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, dma = dma_map_single(tx_q->dev, skb->data, size, DMA_TO_DEVICE); tx_buf = first; + first->nr_frags = 0; params->compl_tag = (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; @@ -2316,6 +2356,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (dma_mapping_error(tx_q->dev, dma)) return idpf_tx_dma_map_error(tx_q, skb, first, i); + first->nr_frags++; idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; tx_buf->type = LIBETH_SQE_FRAG; @@ -2371,14 +2412,15 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, max_data); - tx_desc++; - i++; - - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); + } else { + tx_buf++; + tx_desc++; } /* Since this packet has a buffer that is going to span @@ -2391,7 +2433,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, * simply pass over these holes and finish cleaning the * rest of the packet. 
*/ - tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; + tx_buf->type = LIBETH_SQE_EMPTY; /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, @@ -2415,13 +2457,15 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, break; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); - tx_desc++; - i++; - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); + } else { + tx_buf++; + tx_desc++; } size = skb_frag_size(frag); @@ -2429,8 +2473,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, dma = skb_frag_dma_map(tx_q->dev, frag, 0, size, DMA_TO_DEVICE); - - tx_buf = &tx_q->tx_buf[i]; } /* record SW timestamp if HW timestamp is not available */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9ead6a55f9cbc..4d4086aad6203 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -133,7 +133,7 @@ do { \ */ #define IDPF_TX_COMPLQ_PENDING(txq) \ (((txq)->num_completions_pending >= (txq)->complq->num_completions ? \ - 0 : U64_MAX) + \ + 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) #define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 @@ -847,7 +847,7 @@ struct idpf_compl_queue { struct net_device *netdev; u32 clean_budget; - u32 num_completions; + aligned_u64 num_completions; /* Slowpath */ u32 q_id; @@ -968,7 +968,7 @@ struct idpf_txq_group { struct idpf_compl_queue *complq; - u32 num_completions_pending; + aligned_u64 num_completions_pending; }; /** From ccf488a3ffefb46dcf172a2c154b22b5f33bd767 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Wed, 25 Feb 2026 15:46:51 +0100 Subject: [PATCH 44/50] idpf: set completion tag for "empty" bufs associated with a packet jira KERNEL-168 commit-author Joshua Hay commit 4c69c77aafe74cf755af55070584b643e5c4e4d8 Commit d9028db618a6 ("idpf: convert to libeth Tx buffer completion") inadvertently removed code that was necessary for the tx buffer cleaning routine to iterate over all buffers associated with a packet. When a frag is too large for a single data descriptor, it will be split across multiple data descriptors. This means the frag will span multiple buffers in the buffer ring in order to keep the descriptor and buffer ring indexes aligned. The buffer entries in the ring are technically empty and no cleaning actions need to be performed. These empty buffers can precede other frags associated with the same packet. I.e. a single packet on the buffer ring can look like: buf[0]=skb0.frag0 buf[1]=skb0.frag1 buf[2]=empty buf[3]=skb0.frag2 The cleaning routine iterates through these buffers based on a matching completion tag. If the completion tag is not set for buf[2], the loop will end prematurely. Frag2 will be left uncleaned and next_to_clean will be left pointing to the end of the packet, which will break the cleaning logic for subsequent cleans. This consequently leads to tx timeouts. Assign the empty bufs the same completion tag as the packet to ensure the cleaning routine iterates over all of the buffers associated with the packet.
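To make the failure mode concrete, here is a self-contained toy model of the tag-matching clean walk (illustrative userspace C only; the ring contents, tag values, and names are invented, not the driver's code):

#include <stdio.h>

#define RING_SIZE 8
#define TAG_NONE -1

struct buf { int compl_tag; const char *what; };

int main(void)
{
        /* skb0 spans slots 0..3; slot 2 is an "empty" hole left by a frag
         * that was split across two data descriptors. Before this fix the
         * hole kept TAG_NONE, so the tag-matching walk below stopped at
         * slot 2 and frag2 in slot 3 was never cleaned.
         */
        struct buf ring[RING_SIZE] = {
                { 7, "skb0.frag0" },
                { 7, "skb0.frag1" },
                { 7, "empty" },         /* put TAG_NONE here to reproduce the bug */
                { 7, "skb0.frag2" },
                { TAG_NONE, "idle" },
        };
        int compl_tag = 7, ntc = 0;

        while (ntc < RING_SIZE && ring[ntc].compl_tag == compl_tag) {
                printf("clean slot %d: %s\n", ntc, ring[ntc].what);
                ntc++;
        }
        printf("next_to_clean = %d\n", ntc);    /* 4 with the fix, 2 without */
        return 0;
}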
Fixes: d9028db618a6 ("idpf: convert to libeth Tx buffer completion") Signed-off-by: Joshua Hay Acked-by: Alexander Lobakin Reviewed-by: Madhu Chittim Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen (cherry picked from commit 4c69c77aafe74cf755af55070584b643e5c4e4d8) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index e5ea2e40d9215..5c2824200cb70 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2434,6 +2434,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, * rest of the packet. */ tx_buf->type = LIBETH_SQE_EMPTY; + idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, From 17d7d8f3839ece41729ff7d76422f29591a4be5e Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 23 Feb 2026 15:16:09 +0100 Subject: [PATCH 45/50] idpf: add support for Tx refillqs in flow scheduling mode jira KERNEL-168 commit-author Joshua Hay commit cb83b559bea39f207ee214ee2972657e8576ed18 upstream-diff | 1. adjusted context around idpf_rx_splitq_clean function due to missing - 74d1412ac8f3 ("idpf: use libeth Rx buffer management for payload buffer") - 6ad5ff6e7282 ("libeth: convert to netmem") 2. adjusted context around struct idpf_tx_queue members and docstring and did not include the libeth_cacheline_set_assert changes due to missing: - 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") 3. fix compilation issue (std=89) in idpf_tx_desc_alloc due to for loop var declaration In certain production environments, it is possible for completion tags to collide, meaning N packets with the same completion tag are in flight at the same time. In this environment, any given Tx queue is effectively used to send both slower traffic and higher throughput traffic simultaneously. This is the result of a customer's specific configuration in the device pipeline, the details of which Intel cannot provide. This configuration results in a small number of out-of-order completions, i.e., a small number of packets in flight. The existing guardrails in the driver only protect against a large number of packets in flight. The slower flow completions are delayed, which causes the out-of-order completions. The fast flow will continue sending traffic and generating tags. Because tags are generated on the fly, the fast flow eventually uses the same tag for a packet that is still in flight from the slower flow. The driver has no idea which packet it should clean when it processes the completion with that tag, but it will look for the packet on the buffer ring before the hash table. If the slower flow packet completion is processed first, it will end up cleaning the fast flow packet on the ring prematurely. This leaves the descriptor ring in a bad state, resulting in a crash or Tx timeout. In summary, generating a tag when a packet is sent can lead to the same tag being associated with multiple packets. This can lead to resource leaks, crashes, and/or Tx timeouts. Before we can replace the tag generation, we need a new mechanism for the send path to know what tag to use next. The driver will allocate and initialize a refillq for each TxQ with all of the possible free tag values.
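The tag ring sketched below is a simplified userspace model of the refillq introduced here and walked through in the next paragraph; the mask and field names only approximate the macros added in the diff, so treat it as a sketch rather than the driver code:

#include <stdbool.h>
#include <stdint.h>

#define RFL_BI_BUFID_M 0xffffu          /* like GENMASK(15, 0) */
#define RFL_BI_GEN_M (1u << 16)         /* like BIT(16) */

struct sw_queue {
        uint32_t *ring;
        uint32_t desc_count;
        uint32_t next_to_use;           /* clean path posts freed tags here */
        uint32_t next_to_clean;         /* send path pops free tags here */
        bool use_gen, clean_gen;        /* SW-maintained generation bits */
};

/* Send path: pop the next free tag, or fail if none are available. */
bool get_free_buf_id(struct sw_queue *q, uint16_t *buf_id)
{
        uint32_t desc = q->ring[q->next_to_clean];

        /* A stale generation bit means this entry has not been
         * (re)posted yet, i.e. the ring holds no free tags.
         */
        if (!!(desc & RFL_BI_GEN_M) != q->clean_gen)
                return false;
        *buf_id = desc & RFL_BI_BUFID_M;
        if (++q->next_to_clean == q->desc_count) {
                q->next_to_clean = 0;
                q->clean_gen = !q->clean_gen;
        }
        return true;
}

/* Clean path: post a tag back, flipping generation on wrap. */
void post_buf_id(struct sw_queue *q, uint16_t buf_id)
{
        q->ring[q->next_to_use] = buf_id | (q->use_gen ? RFL_BI_GEN_M : 0);
        if (++q->next_to_use == q->desc_count) {
                q->next_to_use = 0;
                q->use_gen = !q->use_gen;
        }
}

The generation bits play the same role as in the Rx refill queues: they distinguish freshly posted entries from stale ones once the indexes wrap.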
During send, the driver grabs the next free tag from the refillq's next_to_clean. While cleaning the packet, the clean routine posts the tag back to the refillq's next_to_use to indicate that it is now free to use. This mechanism works exactly the same way as the existing Rx refill queues, which post the cleaned buffer IDs back to the buffer queue to be reposted to HW. Since we're using the refillqs for both Rx and Tx now, genericize some of the existing refillq support. Note: the refillqs will not be used yet. This is only demonstrating how they will be used to pass free tags back to the send path. Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit cb83b559bea39f207ee214ee2972657e8576ed18) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 94 +++++++++++++++++++-- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 +- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 5c2824200cb70..eff094d2f5059 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -137,6 +137,9 @@ static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) if (!txq->desc_ring) return; + if (txq->refillq) + kfree(txq->refillq->ring); + dmam_free_coherent(txq->dev, txq->size, txq->desc_ring, txq->dma); txq->desc_ring = NULL; txq->next_to_use = 0; @@ -242,7 +245,9 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, struct idpf_tx_queue *tx_q) { struct device *dev = tx_q->dev; + struct idpf_sw_queue *refillq; int err; + unsigned int i = 0; err = idpf_tx_buf_alloc_all(tx_q); if (err) @@ -265,6 +270,29 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, tx_q->next_to_clean = 0; idpf_queue_set(GEN_CHK, tx_q); + if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) + return 0; + + refillq = tx_q->refillq; + refillq->desc_count = tx_q->desc_count; + refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), + GFP_KERNEL); + if (!refillq->ring) { + err = -ENOMEM; + goto err_alloc; + } + + for (i = 0; i < refillq->desc_count; i++) + refillq->ring[i] = + FIELD_PREP(IDPF_RFL_BI_BUFID_M, i) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, + idpf_queue_has(GEN_CHK, refillq)); + + /* Go ahead and flip the GEN bit since this counts as filling + * up the ring, i.e. we already ring wrapped.
+ */ + idpf_queue_change(GEN_CHK, refillq); + return 0; err_alloc: @@ -586,18 +614,18 @@ static int idpf_rx_hdr_buf_alloc_all(struct idpf_buf_queue *bufq) } /** - * idpf_rx_post_buf_refill - Post buffer id to refill queue + * idpf_post_buf_refill - Post buffer id to refill queue * @refillq: refill queue to post to * @buf_id: buffer id to post */ -static void idpf_rx_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) +static void idpf_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) { u32 nta = refillq->next_to_use; /* store the buffer ID and the SW maintained GEN bit to the refillq */ refillq->ring[nta] = - FIELD_PREP(IDPF_RX_BI_BUFID_M, buf_id) | - FIELD_PREP(IDPF_RX_BI_GEN_M, + FIELD_PREP(IDPF_RFL_BI_BUFID_M, buf_id) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, idpf_queue_has(GEN_CHK, refillq)); if (unlikely(++nta == refillq->desc_count)) { @@ -977,6 +1005,11 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) struct idpf_txq_group *txq_grp = &vport->txq_grps[i]; for (j = 0; j < txq_grp->num_txq; j++) { + if (flow_sch_en) { + kfree(txq_grp->txqs[j]->refillq); + txq_grp->txqs[j]->refillq = NULL; + } + kfree(txq_grp->txqs[j]); txq_grp->txqs[j] = NULL; } @@ -1394,6 +1427,13 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) } idpf_queue_set(FLOW_SCH_EN, q); + + q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); + if (!q->refillq) + goto err_alloc; + + idpf_queue_set(GEN_CHK, q->refillq); + idpf_queue_set(RFL_GEN_CHK, q->refillq); } if (!split) @@ -1950,6 +1990,8 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); + idpf_post_buf_refill(txq->refillq, compl_tag); + /* If we didn't clean anything on the ring, this packet must be * in the hash table. Go clean it there. 
*/ @@ -2309,6 +2351,37 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) return ntu; } +/** + * idpf_tx_get_free_buf_id - get a free buffer ID from the refill queue + * @refillq: refill queue to get buffer ID from + * @buf_id: return buffer ID + * + * Return: true if a buffer ID was found, false if not + */ +static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, + u16 *buf_id) +{ + u32 ntc = refillq->next_to_clean; + u32 refill_desc; + + refill_desc = refillq->ring[ntc]; + + if (unlikely(idpf_queue_has(RFL_GEN_CHK, refillq) != + !!(refill_desc & IDPF_RFL_BI_GEN_M))) + return false; + + *buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); + + if (unlikely(++ntc == refillq->desc_count)) { + idpf_queue_change(RFL_GEN_CHK, refillq); + ntc = 0; + } + + refillq->next_to_clean = ntc; + + return true; +} + /** * idpf_tx_splitq_map - Build the Tx flex descriptor * @tx_q: queue to send buffer on @@ -2784,6 +2857,13 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, } if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &tx_params.compl_tag))) { + u64_stats_update_begin(&tx_q->stats_sync); + u64_stats_inc(&tx_q->q_stats.q_busy); + u64_stats_update_end(&tx_q->stats_sync); + } + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; /* Set the RE bit to catch any packets that may have not been @@ -3360,7 +3440,7 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) if (!skb) break; - idpf_rx_post_buf_refill(refillq, buf_id); + idpf_post_buf_refill(refillq, buf_id); IDPF_RX_BUMP_NTC(rxq, ntc); /* skip if it is non EOP desc */ @@ -3458,10 +3538,10 @@ static void idpf_rx_clean_refillq(struct idpf_buf_queue *bufq, bool failure; if (idpf_queue_has(RFL_GEN_CHK, refillq) != - !!(refill_desc & IDPF_RX_BI_GEN_M)) + !!(refill_desc & IDPF_RFL_BI_GEN_M)) break; - buf_id = FIELD_GET(IDPF_RX_BI_BUFID_M, refill_desc); + buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); failure = idpf_rx_update_bufq_desc(bufq, buf_id, buf_desc); if (failure) break; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 4d4086aad6203..647093b2d3dd4 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -113,8 +113,8 @@ do { \ */ #define IDPF_TX_SPLITQ_RE_MIN_GAP 64 -#define IDPF_RX_BI_GEN_M BIT(16) -#define IDPF_RX_BI_BUFID_M GENMASK(15, 0) +#define IDPF_RFL_BI_GEN_M BIT(16) +#define IDPF_RFL_BI_BUFID_M GENMASK(15, 0) #define IDPF_RXD_EOF_SPLITQ VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_EOF_M #define IDPF_RXD_EOF_SINGLEQ VIRTCHNL2_RX_BASE_DESC_STATUS_EOF_M @@ -685,6 +685,7 @@ struct idpf_rx_queue { * @cleaned_pkts: Number of packets cleaned for the above said case * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @tx_min_pkt_len: Min supported packet length + * @refillq: Pointer to refill queue * @compl_tag_bufid_m: Completion tag buffer id mask * @compl_tag_gen_s: Completion tag generation bit * The format of the completion tag will change based on the TXQ @@ -746,6 +747,7 @@ struct idpf_tx_queue { u16 tx_max_bufs; u16 tx_min_pkt_len; + struct idpf_sw_queue *refillq; u16 compl_tag_bufid_m; u16 compl_tag_gen_s; From 94a1bed6751ab542e8db1f882db9ae392f0099c4 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 23 Feb 2026 15:55:30 +0100 Subject: [PATCH 46/50] idpf: improve when to set RE bit logic jira KERNEL-168 commit-author Joshua 
Hay commit f2d18e16479cac7a708d77cbfb4220a9114a71fc upstream-diff | adjusted context in struct idpf_tx_queue because the order of the fields is different due to missing - 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") Track the gap between next_to_use and the last RE index. Set RE again if the gap is large enough to ensure the RE bit is set frequently. This is critical before removing the stashing mechanisms because the opportunistic descriptor ring cleaning from the out-of-order completions will go away. Previously the descriptors would be "cleaned" by both the descriptor (RE) completion and the out-of-order completions. Without the latter, we must ensure the RE bit is set more frequently. Otherwise, it's theoretically possible for the descriptor ring next_to_clean to never advance. The previous implementation was dependent on the start of a packet falling on a 64th index in the descriptor ring, which is not guaranteed with large packets. Signed-off-by: Luigi Rizzo Signed-off-by: Brian Vazquez Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit f2d18e16479cac7a708d77cbfb4220a9114a71fc) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 20 +++++++++++++++++++- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 ++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index eff094d2f5059..8493070cbb5f7 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -293,6 +293,8 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, */ idpf_queue_change(GEN_CHK, refillq); + tx_q->last_re = tx_q->desc_count - IDPF_TX_SPLITQ_RE_MIN_GAP; + return 0; err_alloc: @@ -2791,6 +2793,21 @@ netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb) return NETDEV_TX_OK; } +/** + * idpf_tx_splitq_need_re - check whether RE bit needs to be set + * @tx_q: pointer to Tx queue + * + * Return: true if RE bit needs to be set, false otherwise + */ +static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) +{ + int gap = tx_q->next_to_use - tx_q->last_re; + + gap += (gap < 0) ? tx_q->desc_count : 0; + + return gap >= IDPF_TX_SPLITQ_RE_MIN_GAP; +} + /** * idpf_tx_splitq_frame - Sends buffer on Tx ring using flex descriptors * @skb: send buffer @@ -2871,9 +2888,10 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, * MIN_RING size to ensure it will be set at least once each * time around the ring.
*/ - if (!(tx_q->next_to_use % IDPF_TX_SPLITQ_RE_MIN_GAP)) { + if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; tx_q->txq_grp->num_completions_pending++; + tx_q->last_re = tx_q->next_to_use; } if (skb->ip_summed == CHECKSUM_PARTIAL) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 647093b2d3dd4..35c8f5c4de14a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -672,6 +672,8 @@ struct idpf_rx_queue { * @desc_count: Number of descriptors * @next_to_use: Next descriptor to use * @next_to_clean: Next descriptor to clean + * @last_re: last descriptor index that RE bit was set + * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @netdev: &net_device corresponding to this queue * @cleaned_bytes: Splitq only, TXQ only: When a TX completion is received on * the TX completion queue, it can be for any TXQ associated @@ -683,7 +685,6 @@ struct idpf_rx_queue { * only once at the end of the cleaning routine. * @clean_budget: singleq only, queue cleaning budget * @cleaned_pkts: Number of packets cleaned for the above said case - * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @tx_min_pkt_len: Min supported packet length * @refillq: Pointer to refill queue * @compl_tag_bufid_m: Completion tag buffer id mask @@ -736,6 +737,8 @@ struct idpf_tx_queue { u16 desc_count; u16 next_to_use; u16 next_to_clean; + u16 last_re; + u16 tx_max_bufs; struct net_device *netdev; @@ -745,7 +748,6 @@ struct idpf_tx_queue { }; u16 cleaned_pkts; - u16 tx_max_bufs; u16 tx_min_pkt_len; struct idpf_sw_queue *refillq; From d6dd009881099e301df590c61e8e39dd4dfe71e5 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:20 -0700 Subject: [PATCH 47/50] idpf: simplify and fix splitq Tx packet rollback error path jira KERNEL-168 commit-author Joshua Hay commit b61dfa9bc4430ad82b96d3a7c1c485350f91b467 upstream-diff | adjusted context in 2 places: - when removing func idpf_tx_dma_map_error due to different memset call that uses the hardcoded struct type; - in func idpf_tx_splitq_frame due to missing expected union idpf_flex_tx_ctx_desc *ctx_desc; both differences were introduced in commit 1a49cf814fe1e ("idpf: add Tx timestamp flows"). Move (and rename) the existing rollback logic to singleq.c since that will be the only consumer. Create a simplified splitq specific rollback function to loop through and unmap tx_bufs based on the completion tag. This is critical before replacing the Tx buffer ring with the buffer pool since the previous rollback indexing will not work to unmap the chained buffers from the pool. Cache the next_to_use index before any portion of the packet is put on the descriptor ring. In case of an error, the rollback will bump tail to the correct next_to_use value. Because the splitq path now supports different types of context descriptors (and potentially multiple in the future), this will take care of rolling back any and all context descriptors encoded on the ring for the erroneous packet. The previous rollback logic was broken for PTP packets since it would not account for the PTP context descriptor. 
Fixes: 1a49cf814fe1 ("idpf: add Tx timestamp flows") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit b61dfa9bc4430ad82b96d3a7c1c485350f91b467) Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 57 +++++++++++- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 91 ++++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 5 +- 3 files changed, 95 insertions(+), 58 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 762c8af258522..0163d4488ff05 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -178,6 +178,58 @@ static int idpf_tx_singleq_csum(struct sk_buff *skb, return 1; } +/** + * idpf_tx_singleq_dma_map_error - handle TX DMA map errors + * @txq: queue to send buffer on + * @skb: send buffer + * @first: original first buffer info buffer for packet + * @idx: starting point on ring to unwind + */ +static void idpf_tx_singleq_dma_map_error(struct idpf_tx_queue *txq, + struct sk_buff *skb, + struct idpf_tx_buf *first, u16 idx) +{ + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + /* clear dma mappings for failed tx_buf map */ + for (;;) { + struct idpf_tx_buf *tx_buf; + + tx_buf = &txq->tx_buf[idx]; + libeth_tx_complete(tx_buf, &cp); + if (tx_buf == first) + break; + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + if (skb_is_gso(skb)) { + union idpf_tx_flex_desc *tx_desc; + + /* If we failed a DMA mapping for a TSO packet, we will have + * used one additional descriptor for a context + * descriptor. Reset that here. 
+ */ + tx_desc = &txq->flex_tx[idx]; + memset(tx_desc, 0, sizeof(*tx_desc)); + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + /* Update tail in case netdev_xmit_more was previously true */ + idpf_tx_buf_hw_update(txq, idx, false); +} + /** * idpf_tx_singleq_map - Build the Tx base descriptor * @tx_q: queue to send buffer on @@ -218,8 +270,9 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_singleq_dma_map_error(tx_q, skb, + first, i); /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 8493070cbb5f7..7eba4f151bdbf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2285,57 +2285,6 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; } -/** - * idpf_tx_dma_map_error - handle TX DMA map errors - * @txq: queue to send buffer on - * @skb: send buffer - * @first: original first buffer info buffer for packet - * @idx: starting point on ring to unwind - */ -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 idx) -{ - struct libeth_sq_napi_stats ss = { }; - struct libeth_cq_pp cp = { - .dev = txq->dev, - .ss = &ss, - }; - - u64_stats_update_begin(&txq->stats_sync); - u64_stats_inc(&txq->q_stats.dma_map_errs); - u64_stats_update_end(&txq->stats_sync); - - /* clear dma mappings for failed tx_buf map */ - for (;;) { - struct idpf_tx_buf *tx_buf; - - tx_buf = &txq->tx_buf[idx]; - libeth_tx_complete(tx_buf, &cp); - if (tx_buf == first) - break; - if (idx == 0) - idx = txq->desc_count; - idx--; - } - - if (skb_is_gso(skb)) { - union idpf_tx_flex_desc *tx_desc; - - /* If we failed a DMA mapping for a TSO packet, we will have - * used one additional descriptor for a context - * descriptor. Reset that here. - */ - tx_desc = &txq->flex_tx[idx]; - memset(tx_desc, 0, sizeof(struct idpf_flex_tx_ctx_desc)); - if (idx == 0) - idx = txq->desc_count; - idx--; - } - - /* Update tail in case netdev_xmit_more was previously true */ - idpf_tx_buf_hw_update(txq, idx, false); -} - /** * idpf_tx_splitq_bump_ntu - adjust NTU and generation * @txq: the tx ring to wrap @@ -2384,6 +2333,37 @@ static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, return true; } +/** + * idpf_tx_splitq_pkt_err_unmap - Unmap buffers and bump tail in case of error + * @txq: Tx queue to unwind + * @params: pointer to splitq params struct + * @first: starting buffer for packet to unmap + */ +static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, + struct idpf_tx_splitq_params *params, + struct idpf_tx_buf *first) +{ + struct libeth_sq_napi_stats ss = { }; + struct idpf_tx_buf *tx_buf = first; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + u32 idx = 0; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + do { + libeth_tx_complete(tx_buf, &cp); + idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + } while (idpf_tx_buf_compl_tag(tx_buf) == params->compl_tag); + + /* Update tail in case netdev_xmit_more was previously true. 
*/ + idpf_tx_buf_hw_update(txq, params->prev_ntu, false); +} + /** * idpf_tx_splitq_map - Build the Tx flex descriptor * @tx_q: queue to send buffer on @@ -2428,8 +2408,9 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); first->nr_frags++; idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; @@ -2818,7 +2799,9 @@ static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q) { - struct idpf_tx_splitq_params tx_params = { }; + struct idpf_tx_splitq_params tx_params = { + .prev_ntu = tx_q->next_to_use, + }; struct idpf_tx_buf *first; unsigned int count; int tso; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 35c8f5c4de14a..e3f9183232268 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -200,6 +200,7 @@ struct idpf_tx_offload_params { * @compl_tag: Associated tag for completion * @td_tag: Descriptor tunneling tag * @offload: Offload parameters + * @prev_ntu: stored TxQ next_to_use in case of rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -210,6 +211,8 @@ struct idpf_tx_splitq_params { }; struct idpf_tx_offload_params offload; + + u16 prev_ntu; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -1131,8 +1134,6 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 ring_idx); unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, struct sk_buff *skb); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); From e6d108730774ce4d215b9d11172d44fe2e9bbbcb Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:21 -0700 Subject: [PATCH 48/50] idpf: replace flow scheduling buffer ring with buffer pool jira KERNEL-168 commit-author Joshua Hay commit 5f417d551324d2894168b362f2429d120ab06243 upstream-diff | adjusted context in: - idpf_tx_splitq_frame and idpf_tx_clean_bufs; - struct idpf_tx_queue due to some missing elements in the struct; both due to missing commit - 1a49cf814fe1e ("idpf: add Tx timestamp flows"). and did not include the cacheline assert changes due to missing - 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") Replace the TxQ buffer ring with one large pool/array of buffers (only for flow scheduling). This eliminates the tag generation and makes it impossible for a tag to be associated with more than one packet. The completion tag passed to HW through the descriptor is the index into the array. That same completion tag is posted back to the driver in the completion descriptor, and used to index into the array to quickly retrieve the buffer during cleaning. In this way, the tags are treated as a fixed-size resource. If all tags are in use, no more packets can be sent on that particular queue (until some are freed up). The tag pool size is 64K since the completion tag width is 16 bits.
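A toy model of cleaning by pool index, including the next_buf chaining that the following paragraphs describe (simplified userspace C; the slot numbers and helper names are made up):

#include <stdint.h>
#include <stdio.h>

#define TXBUF_NULL UINT32_MAX   /* chain terminator sentinel */

struct tx_buf { uint32_t next_buf; const char *what; };

/* Clean a packet starting from the tag in the completion descriptor,
 * following the next_buf chain and returning each tag to the free pool.
 */
void clean_bufs(struct tx_buf *pool, uint32_t buf_id)
{
        while (buf_id != TXBUF_NULL) {
                uint32_t next = pool[buf_id].next_buf;

                printf("free tag %u (%s)\n", buf_id, pool[buf_id].what);
                /* real driver: libeth_tx_complete() + post tag to refillq */
                buf_id = next;
        }
}

int main(void)
{
        /* a 3-frag packet whose tags landed in pool slots 5 -> 42 -> 7 */
        static struct tx_buf pool[64];

        pool[5] = (struct tx_buf){ .next_buf = 42, .what = "skb/frag0" };
        pool[42] = (struct tx_buf){ .next_buf = 7, .what = "frag1" };
        pool[7] = (struct tx_buf){ .next_buf = TXBUF_NULL, .what = "frag2" };

        clean_bufs(pool, 5);    /* 5 == completion tag from HW */
        return 0;
}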
For each packet, the driver pulls a free tag from the refillq to get the next free buffer index. When cleaning is complete, the tag is posted back to the refillq. A multi-frag packet spans multiple buffers in the driver, so it uses multiple buffer indexes/tags from the pool. Each frag pulls from the refillq to get the next free buffer index. These are tracked in a next_buf field that replaces the completion tag field in the buffer struct. This chains the buffers together so that the packet can be cleaned from the starting completion tag taken from the completion descriptor, then from the next_buf field for each subsequent buffer. If a dma_mapping_error occurs or the refillq runs out of free buf_ids, the packet will execute the rollback error path. This unmaps any buffers previously mapped for the packet. Since several free buf_ids could have already been pulled from the refillq, we need to restore its original state as well. Otherwise, the buf_ids/tags will be leaked and not used again until the queue is reallocated. Descriptor completions only advance the descriptor ring index to "clean" the descriptors. The packet completions only clean the buffers associated with the given packet completion tag and do not update the descriptor ring index. When operating in queue-based scheduling mode, the array still acts as a ring and will only have TxQ descriptor count entries. The tx_bufs are still associated 1:1 with the descriptor ring entries and we can use the conventional indexing mechanisms. Fixes: c2d548cad150 ("idpf: add TX splitq napi poll support") Signed-off-by: Luigi Rizzo Signed-off-by: Brian Vazquez Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit 5f417d551324d2894168b362f2429d120ab06243) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 207 +++++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 + 2 files changed, 104 insertions(+), 111 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 7eba4f151bdbf..ff0693c8e1e4b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -11,6 +11,7 @@ struct idpf_tx_stash { struct libeth_sqe buf; }; +#define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) #define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) LIBETH_SQE_CHECK_PRIV(u32); @@ -89,7 +90,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) return; /* Free all the Tx buffer sk_buffs */ - for (i = 0; i < txq->desc_count; i++) + for (i = 0; i < txq->buf_pool_size; i++) libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); @@ -197,14 +198,17 @@ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { struct idpf_buf_lifo *buf_stack; - int buf_size; int i; /* Allocate book keeping buffers only.
Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ - buf_size = sizeof(struct idpf_tx_buf) * tx_q->desc_count; - tx_q->tx_buf = kzalloc(buf_size, GFP_KERNEL); + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) + tx_q->buf_pool_size = U16_MAX; + else + tx_q->buf_pool_size = tx_q->desc_count; + tx_q->tx_buf = kcalloc(tx_q->buf_pool_size, sizeof(*tx_q->tx_buf), + GFP_KERNEL); if (!tx_q->tx_buf) return -ENOMEM; @@ -274,7 +278,7 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, return 0; refillq = tx_q->refillq; - refillq->desc_count = tx_q->desc_count; + refillq->desc_count = tx_q->buf_pool_size; refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), GFP_KERNEL); if (!refillq->ring) { @@ -1819,6 +1823,12 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, struct idpf_tx_buf *tx_buf; bool clean_complete = true; + if (descs_only) { + /* Bump ring index to mark as cleaned. */ + tx_q->next_to_clean = end; + return true; + } + tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; tx_buf = &tx_q->tx_buf[ntc]; @@ -1885,83 +1895,40 @@ do { \ } while (0) /** - * idpf_tx_clean_buf_ring - clean flow scheduling TX queue buffers + * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) + * @buf_id: packet's starting buffer ID, from completion descriptor * @cleaned: pointer to stats struct to track cleaned packets/bytes * @budget: Used to determine if we are in netpoll * - * Cleans all buffers associated with the input completion tag either from the - * TX buffer ring or from the hash table if the buffers were previously - * stashed. Returns the byte/segment count for the cleaned packet associated - * this completion tag. + * Clean all buffers associated with the packet starting at buf_id. Returns the + * byte/segment count for the cleaned packet. */ -static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) +static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, + struct libeth_sq_napi_stats *cleaned, + int budget) { - u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = cleaned, .napi = budget, }; - u16 ntc, orig_idx = idx; - - tx_buf = &txq->tx_buf[idx]; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX || - idpf_tx_buf_compl_tag(tx_buf) != compl_tag)) - return false; - if (tx_buf->type == LIBETH_SQE_SKB) + tx_buf = &txq->tx_buf[buf_id]; + if (tx_buf->type == LIBETH_SQE_SKB) { libeth_tx_complete(tx_buf, &cp); + idpf_post_buf_refill(txq->refillq, buf_id); + } - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + buf_id = idpf_tx_buf_next(tx_buf); - while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) { + tx_buf = &txq->tx_buf[buf_id]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + idpf_post_buf_refill(txq->refillq, buf_id); } - /* - * It's possible the packet we just cleaned was an out of order - * completion, which means we can stash the buffers starting from - * the original next_to_clean and reuse the descriptors. We need - * to compare the descriptor ring next_to_clean packet's "first" buffer - * to the "first" buffer of the packet we just cleaned to determine if - * this is the case. 
Howevever, next_to_clean can point to either a - * reserved buffer that corresponds to a context descriptor used for the - * next_to_clean packet (TSO packet) or the "first" buffer (single - * packet). The orig_idx from the packet we just cleaned will always - * point to the "first" buffer. If next_to_clean points to a reserved - * buffer, let's bump ntc once and start the comparison from there. - */ - ntc = txq->next_to_clean; - tx_buf = &txq->tx_buf[ntc]; - - if (tx_buf->type == LIBETH_SQE_CTX) - idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); - - /* - * If ntc still points to a different "first" buffer, clean the - * descriptor ring and stash all of the buffers for later cleaning. If - * we cannot stash all of the buffers, next_to_clean will point to the - * "first" buffer of the packet that could not be stashed and cleaning - * will start there next time. - */ - if (unlikely(tx_buf != &txq->tx_buf[orig_idx] && - !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned, - true))) - return true; - - /* - * Otherwise, update next_to_clean to reflect the cleaning that was - * done above. - */ - txq->next_to_clean = idx; - return true; } @@ -1992,12 +1959,10 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - idpf_post_buf_refill(txq->refillq, compl_tag); - /* If we didn't clean anything on the ring, this packet must be * in the hash table. Go clean it there. */ - if (!idpf_tx_clean_buf_ring(txq, compl_tag, cleaned, budget)) + if (!idpf_tx_clean_bufs(txq, compl_tag, cleaned, budget)) idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); } @@ -2310,7 +2275,7 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) * Return: true if a buffer ID was found, false if not */ static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, - u16 *buf_id) + u32 *buf_id) { u32 ntc = refillq->next_to_clean; u32 refill_desc; @@ -2343,25 +2308,34 @@ static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, struct idpf_tx_splitq_params *params, struct idpf_tx_buf *first) { + struct idpf_sw_queue *refillq = txq->refillq; struct libeth_sq_napi_stats ss = { }; struct idpf_tx_buf *tx_buf = first; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - u32 idx = 0; u64_stats_update_begin(&txq->stats_sync); u64_stats_inc(&txq->q_stats.dma_map_errs); u64_stats_update_end(&txq->stats_sync); - do { + libeth_tx_complete(tx_buf, &cp); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + tx_buf = &txq->tx_buf[idpf_tx_buf_next(tx_buf)]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); - } while (idpf_tx_buf_compl_tag(tx_buf) == params->compl_tag); + } /* Update tail in case netdev_xmit_more was previously true. */ idpf_tx_buf_hw_update(txq, params->prev_ntu, false); + + if (!refillq) + return; + + /* Restore refillq state to avoid leaking tags. 
*/ + if (params->prev_refill_gen != idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = params->prev_refill_ntc; } /** @@ -2385,6 +2359,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, struct netdev_queue *nq; struct sk_buff *skb; skb_frag_t *frag; + u32 next_buf_id; u16 td_cmd = 0; dma_addr_t dma; @@ -2402,18 +2377,16 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, tx_buf = first; first->nr_frags = 0; - params->compl_tag = - (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; - for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (unlikely(dma_mapping_error(tx_q->dev, dma))) + if (unlikely(dma_mapping_error(tx_q->dev, dma))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; return idpf_tx_splitq_pkt_err_unmap(tx_q, params, first); + } first->nr_frags++; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ @@ -2469,29 +2442,14 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, max_data); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } - /* Since this packet has a buffer that is going to span - * multiple descriptors, it's going to leave holes in - * to the TX buffer ring. To ensure these holes do not - * cause issues in the cleaning routines, we will clear - * them of any stale data and assign them the same - * completion tag as the current packet. Then when the - * packet is being cleaned, the cleaning routines will - * simply pass over these holes and finish cleaning the - * rest of the packet. - */ - tx_buf->type = LIBETH_SQE_EMPTY; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; - /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, * max_data will be >= 12K and <= 16K-1. 
On any @@ -2516,15 +2474,26 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &next_buf_id))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } + } else { + next_buf_id = i; + } + idpf_tx_buf_next(tx_buf) = next_buf_id; + tx_buf = &tx_q->tx_buf[next_buf_id]; + size = skb_frag_size(frag); data_len -= size; @@ -2539,6 +2508,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, /* write last descriptor with RS and EOP bits */ first->rs_idx = i; + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); @@ -2747,8 +2717,6 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - txq->tx_buf[i].type = LIBETH_SQE_CTX; - /* grab the next descriptor */ desc = &txq->flex_ctx[i]; txq->next_to_use = idpf_tx_splitq_bump_ntu(txq, i); @@ -2805,6 +2773,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_buf *first; unsigned int count; int tso; + u32 buf_id; count = idpf_tx_desc_count_required(tx_q, skb); if (unlikely(!count)) @@ -2843,26 +2812,28 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, u64_stats_update_end(&tx_q->stats_sync); } - /* record the location of the first descriptor for this packet */ - first = &tx_q->tx_buf[tx_q->next_to_use]; - first->skb = skb; + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + struct idpf_sw_queue *refillq = tx_q->refillq; - if (tso) { - first->packets = tx_params.offload.tso_segs; - first->bytes = skb->len + - ((first->packets - 1) * tx_params.offload.tso_hdr_len); - } else { - first->packets = 1; - first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); - } + /* Save refillq state in case of a packet rollback. Otherwise, + * the tags will be leaked since they will be popped from the + * refillq but never reposted during cleaning. 
+ */ + tx_params.prev_refill_gen = + idpf_queue_has(RFL_GEN_CHK, refillq); + tx_params.prev_refill_ntc = refillq->next_to_clean; - if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, - &tx_params.compl_tag))) { - u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_inc(&tx_q->q_stats.q_busy); - u64_stats_update_end(&tx_q->stats_sync); + &buf_id))) { + if (tx_params.prev_refill_gen != + idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = tx_params.prev_refill_ntc; + + tx_q->next_to_use = tx_params.prev_ntu; + return idpf_tx_drop_skb(tx_q, skb); } + tx_params.compl_tag = buf_id; tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; @@ -2881,6 +2852,8 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TXD_FLEX_FLOW_CMD_CS_EN; } else { + buf_id = tx_q->next_to_use; + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2; tx_params.eop_cmd = IDPF_TXD_LAST_DESC_CMD; @@ -2888,6 +2861,18 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TX_FLEX_DESC_CMD_CS_EN; } + first = &tx_q->tx_buf[buf_id]; + first->skb = skb; + + if (tso) { + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); + } else { + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + idpf_tx_splitq_map(tx_q, &tx_params, first); return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index e3f9183232268..5ee0cac22baaf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -142,6 +142,8 @@ do { \ ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ 0 : (txq)->compl_tag_cur_gen) +#define IDPF_TXBUF_NULL U32_MAX + #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) #define IDPF_TX_FLAGS_TSO BIT(0) @@ -201,6 +203,8 @@ struct idpf_tx_offload_params { * @td_tag: Descriptor tunneling tag * @offload: Offload parameters * @prev_ntu: stored TxQ next_to_use in case of rollback + * @prev_refill_ntc: stored refillq next_to_clean in case of packet rollback + * @prev_refill_gen: stored refillq generation bit in case of packet rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -213,6 +217,8 @@ struct idpf_tx_splitq_params { struct idpf_tx_offload_params offload; u16 prev_ntu; + u16 prev_refill_ntc; + bool prev_refill_gen; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -720,6 +726,7 @@ struct idpf_rx_queue { * @size: Length of descriptor ring in bytes * @dma: Physical address of ring * @q_vector: Backreference to associated vector + * @buf_pool_size: Total number of idpf_tx_buf */ struct idpf_tx_queue { union { @@ -771,6 +778,7 @@ struct idpf_tx_queue { dma_addr_t dma; struct idpf_q_vector *q_vector; + u32 buf_pool_size; } ____cacheline_aligned; /** From f1a4e2b1ce9ac0ca977dd5d867af5f30aa3d0e8e Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:22 -0700 Subject: [PATCH 49/50] idpf: stop Tx if there are insufficient buffer resources jira KERNEL-168 commit-author Joshua Hay commit 0c3f135e840d4a2ba4253e15d530ec61bc30718e upstream-diff | adjusted conflict in idpf_tx_splitq_frame func due to missing 1a49cf814fe1e ("idpf: add Tx timestamp flows"). 
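As a rough sketch of the two-resource check this patch introduces (illustrative C under assumed semantics; the free-count arithmetic mirrors the idpf_tx_splitq_get_free_bufs helper added below, which keeps one ring slot unused to tell a full ring from an empty one):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t refillq_free_bufs(uint32_t ntu, uint32_t ntc,
                                  uint32_t desc_count)
{
        /* free entries between consumer (ntc) and producer (ntu) */
        return (ntu > ntc ? 0 : desc_count) + ntu - ntc - 1;
}

static bool txq_has_room(uint32_t free_descs, uint32_t descs_needed,
                         uint32_t free_bufs, uint32_t bufs_needed)
{
        /* stop the queue unless BOTH resources are sufficient */
        return free_descs >= descs_needed && free_bufs >= bufs_needed;
}

int main(void)
{
        /* freshly filled ring of 256 tags: 255 usable */
        printf("free tags: %u\n", refillq_free_bufs(0, 0, 256));
        printf("room: %d\n", txq_has_room(32, 10, 255, 18));
        return 0;
}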
The Tx refillq logic will cause packets to be silently dropped if there are not enough buffer resources available to send a packet in flow scheduling mode. Instead, determine how many buffers are needed along with the number of descriptors. Make sure there are enough of both resources to send the packet, and stop the queue if not. Fixes: 7292af042bcf ("idpf: fix a race in txq wakeup") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit 0c3f135e840d4a2ba4253e15d530ec61bc30718e) Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 4 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 47 +++++++++++++------ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 15 +++++- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 0163d4488ff05..e761f12d8901d 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -414,11 +414,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, { struct idpf_tx_offload_params offload = { }; struct idpf_tx_buf *first; + u32 count, buf_count = 1; int csum, tso, needed; - unsigned int count; __be16 protocol; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index ff0693c8e1e4b..2a441e0e39040 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2138,15 +2138,22 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag); } -/* Global conditions to tell whether the txq (and related resources) - * has room to allow the use of "size" descriptors. +/** + * idpf_tx_splitq_has_room - check if enough Tx splitq resources are available + * @tx_q: the queue to be checked + * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of Tx buffers required for this packet + * + * Return: 0 if no room available, 1 otherwise */ -static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) +static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, + u32 bufs_needed) { - if (IDPF_DESC_UNUSED(tx_q) < size || + if (IDPF_DESC_UNUSED(tx_q) < descs_needed || IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || - IDPF_TX_BUF_RSV_LOW(tx_q)) + IDPF_TX_BUF_RSV_LOW(tx_q) || + idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) return 0; return 1; } @@ -2155,14 +2162,21 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions * @tx_q: the queue to be checked * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of buffers needed for this packet * - * Returns 0 if stop is not needed + * Return: 0 if stop is not needed */ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q, - unsigned int descs_needed) + u32 descs_needed, + u32 bufs_needed) { + /* Since we have multiple resources to check for splitq, our + * start,stop_thrs becomes a boolean check instead of a count + * threshold.
+	 */
 	if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx,
-				      idpf_txq_has_room(tx_q, descs_needed),
+				      idpf_txq_has_room(tx_q, descs_needed,
+							bufs_needed),
 				      1, 1))
 		return 0;
 
@@ -2204,14 +2218,16 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val,
 }
 
 /**
- * idpf_tx_desc_count_required - calculate number of Tx descriptors needed
+ * idpf_tx_res_count_required - get number of Tx resources needed for this pkt
  * @txq: queue to send buffer on
  * @skb: send buffer
+ * @bufs_needed: (output) number of buffers needed for this skb.
  *
- * Returns number of data descriptors needed for this skb.
+ * Return: number of data descriptors and buffers needed for this skb.
  */
-unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
-					 struct sk_buff *skb)
+unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq,
+					struct sk_buff *skb,
+					u32 *bufs_needed)
 {
 	const struct skb_shared_info *shinfo;
 	unsigned int count = 0, i;
@@ -2222,6 +2238,7 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
 		return count;
 
 	shinfo = skb_shinfo(skb);
+	*bufs_needed += shinfo->nr_frags;
 	for (i = 0; i < shinfo->nr_frags; i++) {
 		unsigned int size;
 
@@ -2771,11 +2788,11 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
 		.prev_ntu = tx_q->next_to_use,
 	};
 	struct idpf_tx_buf *first;
-	unsigned int count;
+	u32 count, buf_count = 1;
 	int tso;
 	u32 buf_id;
 
-	count = idpf_tx_desc_count_required(tx_q, skb);
+	count = idpf_tx_res_count_required(tx_q, skb, &buf_count);
 	if (unlikely(!count))
 		return idpf_tx_drop_skb(tx_q, skb);
 
@@ -2785,7 +2802,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
 	/* Check for splitq specific TX resources */
 	count += (IDPF_TX_DESCS_PER_CACHE_LINE + tso);
 
-	if (idpf_tx_maybe_stop_splitq(tx_q, count)) {
+	if (idpf_tx_maybe_stop_splitq(tx_q, count, buf_count)) {
 		idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false);
 
 		return NETDEV_TX_BUSY;
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index 5ee0cac22baaf..cfa6aa5b3bb8f 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -1112,6 +1112,17 @@ static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector)
 		  reg->dyn_ctl);
 }
 
+/**
+ * idpf_tx_splitq_get_free_bufs - get number of free buf_ids in refillq
+ * @refillq: pointer to refillq containing buf_ids
+ */
+static inline u32 idpf_tx_splitq_get_free_bufs(struct idpf_sw_queue *refillq)
+{
+	return (refillq->next_to_use > refillq->next_to_clean ?
+		0 : refillq->desc_count) +
+	       refillq->next_to_use - refillq->next_to_clean - 1;
+}
+
 int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget);
 void idpf_vport_init_num_qs(struct idpf_vport *vport,
 			    struct virtchnl2_create_vport *vport_msg);
@@ -1142,8 +1153,8 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val,
 			   bool xmit_more);
 unsigned int idpf_size_to_txd_count(unsigned int size);
 netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb);
-unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
-					 struct sk_buff *skb);
+unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq,
+					struct sk_buff *skb, u32 *buf_count);
 void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue);
 netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
 				  struct idpf_tx_queue *tx_q);

From 27fed19c691f754cc76b61fcb6b990a719ac8b4c Mon Sep 17 00:00:00 2001
From: Joshua Hay
Date: Fri, 25 Jul 2025 11:42:23 -0700
Subject: [PATCH 50/50] idpf: remove obsolete stashing code

jira KERNEL-168
commit-author Joshua Hay
commit 6c4e68480238274f84aa50d54da0d9e262df6284
upstream-diff |
  - adjusted context in .h due to different order in the struct
    idpf_tx_queue
  - adjusted context due to missing idpf_tx_read_tstamp func; both are
    due to missing 1a49cf814fe1e ("idpf: add Tx timestamp flows").
  - did not include libeth_cacheline_set_assert for struct idpf_tx_queue
    due to missing 5a816aae2d46 ("idpf: strictly assert cachelines of
    queue and queue vector structures")

With the new Tx buffer management scheme, there is no need for all of
the stashing mechanisms, the hash table, the reserve buffer stack, etc.
Remove all of that.

Signed-off-by: Joshua Hay
Reviewed-by: Madhu Chittim
Reviewed-by: Aleksandr Loktionov
Tested-by: Samuel Salin
Signed-off-by: Tony Nguyen
(cherry picked from commit 6c4e68480238274f84aa50d54da0d9e262df6284)
Signed-off-by: Roxana Nicolescu
---
 drivers/net/ethernet/intel/idpf/idpf_txrx.c | 309 ++------------------
 drivers/net/ethernet/intel/idpf/idpf_txrx.h |  47 ---
 2 files changed, 21 insertions(+), 335 deletions(-)

diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 2a441e0e39040..24b255145f57c 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -6,48 +6,12 @@
 #include "idpf.h"
 #include "idpf_virtchnl.h"
 
-struct idpf_tx_stash {
-	struct hlist_node hlist;
-	struct libeth_sqe buf;
-};
-
 #define idpf_tx_buf_next(buf)		(*(u32 *)&(buf)->priv)
-#define idpf_tx_buf_compl_tag(buf)	(*(u32 *)&(buf)->priv)
 LIBETH_SQE_CHECK_PRIV(u32);
 
 static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs,
			       unsigned int count);
 
-/**
- * idpf_buf_lifo_push - push a buffer pointer onto stack
- * @stack: pointer to stack struct
- * @buf: pointer to buf to push
- *
- * Returns 0 on success, negative on failure
- **/
-static int idpf_buf_lifo_push(struct idpf_buf_lifo *stack,
-			      struct idpf_tx_stash *buf)
-{
-	if (unlikely(stack->top == stack->size))
-		return -ENOSPC;
-
-	stack->bufs[stack->top++] = buf;
-
-	return 0;
-}
-
-/**
- * idpf_buf_lifo_pop - pop a buffer pointer from stack
- * @stack: pointer to stack struct
- **/
-static struct idpf_tx_stash *idpf_buf_lifo_pop(struct idpf_buf_lifo *stack)
-{
-	if (unlikely(!stack->top))
-		return NULL;
-
-	return stack->bufs[--stack->top];
-}
-
 /**
  * idpf_tx_timeout - Respond to a Tx Hang
  * @netdev: network interface device structure
@@ -76,14 +40,11 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue)
 static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
 {
 	struct libeth_sq_napi_stats ss = { };
-	struct idpf_buf_lifo *buf_stack;
-	struct idpf_tx_stash *stash;
 	struct libeth_cq_pp cp = {
 		.dev	= txq->dev,
 		.ss	= &ss,
 	};
-	struct hlist_node *tmp;
-	u32 i, tag;
+	u32 i;
 
 	/* Buffers already cleared, nothing to do */
 	if (!txq->tx_buf)
@@ -95,33 +56,6 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
 
 	kfree(txq->tx_buf);
 	txq->tx_buf = NULL;
-
-	if (!idpf_queue_has(FLOW_SCH_EN, txq))
-		return;
-
-	buf_stack = &txq->stash->buf_stack;
-	if (!buf_stack->bufs)
-		return;
-
-	/*
-	 * If a Tx timeout occurred, there are potentially still bufs in the
-	 * hash table, free them here.
-	 */
-	hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash,
-			   hlist) {
-		if (!stash)
-			continue;
-
-		libeth_tx_complete(&stash->buf, &cp);
-		hash_del(&stash->hlist);
-		idpf_buf_lifo_push(buf_stack, stash);
-	}
-
-	for (i = 0; i < buf_stack->size; i++)
-		kfree(buf_stack->bufs[i]);
-
-	kfree(buf_stack->bufs);
-	buf_stack->bufs = NULL;
 }
 
 /**
@@ -197,9 +131,6 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport)
  */
 static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q)
 {
-	struct idpf_buf_lifo *buf_stack;
-	int i;
-
 	/* Allocate book keeping buffers only. Buffers to be supplied to HW
 	 * are allocated by kernel network stack and received as part of skb
 	 */
 	if (!tx_q->tx_buf)
 		return -ENOMEM;
 
-	if (!idpf_queue_has(FLOW_SCH_EN, tx_q))
-		return 0;
-
-	buf_stack = &tx_q->stash->buf_stack;
-
-	/* Initialize tx buf stack for out-of-order completions if
-	 * flow scheduling offload is enabled
-	 */
-	buf_stack->bufs = kcalloc(tx_q->desc_count, sizeof(*buf_stack->bufs),
-				  GFP_KERNEL);
-	if (!buf_stack->bufs)
-		return -ENOMEM;
-
-	buf_stack->size = tx_q->desc_count;
-	buf_stack->top = tx_q->desc_count;
-
-	for (i = 0; i < tx_q->desc_count; i++) {
-		buf_stack->bufs[i] = kzalloc(sizeof(*buf_stack->bufs[i]),
-					     GFP_KERNEL);
-		if (!buf_stack->bufs[i])
-			return -ENOMEM;
-	}
-
 	return 0;
 }
 
@@ -349,8 +257,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport)
 	for (i = 0; i < vport->num_txq_grp; i++) {
 		for (j = 0; j < vport->txq_grps[i].num_txq; j++) {
 			struct idpf_tx_queue *txq = vport->txq_grps[i].txqs[j];
-			u8 gen_bits = 0;
-			u16 bufidx_mask;
 
 			err = idpf_tx_desc_alloc(vport, txq);
 			if (err) {
					i);
 				goto err_out;
 			}
-
-			if (!idpf_is_queue_model_split(vport->txq_model))
-				continue;
-
-			txq->compl_tag_cur_gen = 0;
-
-			/* Determine the number of bits in the bufid
-			 * mask and add one to get the start of the
-			 * generation bits
-			 */
-			bufidx_mask = txq->desc_count - 1;
-			while (bufidx_mask >> 1) {
-				txq->compl_tag_gen_s++;
-				bufidx_mask = bufidx_mask >> 1;
-			}
-			txq->compl_tag_gen_s++;
-
-			gen_bits = IDPF_TX_SPLITQ_COMPL_TAG_WIDTH -
-				   txq->compl_tag_gen_s;
-			txq->compl_tag_gen_max = GETMAXVAL(gen_bits);
-
-			/* Set bufid mask based on location of first
-			 * gen bit; it cannot simply be the descriptor
-			 * ring size-1 since we can have size values
-			 * where not all of those bits are set.
-			 */
-			txq->compl_tag_bufid_m =
-				GETMAXVAL(txq->compl_tag_gen_s);
 		}
 
 		if (!idpf_is_queue_model_split(vport->txq_model))
@@ -1025,9 +903,6 @@ static void idpf_txq_group_rel(struct idpf_vport *vport)
 
 		kfree(txq_grp->complq);
 		txq_grp->complq = NULL;
-
-		if (flow_sch_en)
-			kfree(txq_grp->stashes);
 	}
 	kfree(vport->txq_grps);
 	vport->txq_grps = NULL;
@@ -1386,7 +1261,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq)
 	for (i = 0; i < vport->num_txq_grp; i++) {
 		struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i];
 		struct idpf_adapter *adapter = vport->adapter;
-		struct idpf_txq_stash *stashes;
 		int j;
 
 		tx_qgrp->vport = vport;
@@ -1399,15 +1273,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq)
 			goto err_alloc;
 		}
 
-		if (split && flow_sch_en) {
-			stashes = kcalloc(num_txq, sizeof(*stashes),
-					  GFP_KERNEL);
-			if (!stashes)
-				goto err_alloc;
-
-			tx_qgrp->stashes = stashes;
-		}
-
 		for (j = 0; j < tx_qgrp->num_txq; j++) {
 			struct idpf_tx_queue *q = tx_qgrp->txqs[j];
 
@@ -1427,11 +1292,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq)
 			if (!flow_sch_en)
 				continue;
 
-			if (split) {
-				q->stash = &stashes[j];
-				hash_init(q->stash->sched_buf_hash);
-			}
-
 			idpf_queue_set(FLOW_SCH_EN, q);
 
 			q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL);
@@ -1697,82 +1557,6 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q)
 	wake_up(&vport->sw_marker_wq);
 }
 
-/**
- * idpf_tx_clean_stashed_bufs - clean bufs that were stored for
- * out of order completions
- * @txq: queue to clean
- * @compl_tag: completion tag of packet to clean (from completion descriptor)
- * @cleaned: pointer to stats struct to track cleaned packets/bytes
- * @budget: Used to determine if we are in netpoll
- */
-static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq,
-				       u16 compl_tag,
-				       struct libeth_sq_napi_stats *cleaned,
-				       int budget)
-{
-	struct idpf_tx_stash *stash;
-	struct hlist_node *tmp_buf;
-	struct libeth_cq_pp cp = {
-		.dev	= txq->dev,
-		.ss	= cleaned,
-		.napi	= budget,
-	};
-
-	/* Buffer completion */
-	hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf,
-				    hlist, compl_tag) {
-		if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag))
-			continue;
-
-		hash_del(&stash->hlist);
-		libeth_tx_complete(&stash->buf, &cp);
-
-		/* Push shadow buf back onto stack */
-		idpf_buf_lifo_push(&txq->stash->buf_stack, stash);
-	}
-}
-
-/**
- * idpf_stash_flow_sch_buffers - store buffer parameters info to be freed at a
- * later time (only relevant for flow scheduling mode)
- * @txq: Tx queue to clean
- * @tx_buf: buffer to store
- */
-static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq,
-				       struct idpf_tx_buf *tx_buf)
-{
-	struct idpf_tx_stash *stash;
-
-	if (unlikely(tx_buf->type <= LIBETH_SQE_CTX))
-		return 0;
-
-	stash = idpf_buf_lifo_pop(&txq->stash->buf_stack);
-	if (unlikely(!stash)) {
-		net_err_ratelimited("%s: No out-of-order TX buffers left!\n",
-				    netdev_name(txq->netdev));
-
-		return -ENOMEM;
-	}
-
-	/* Store buffer params in shadow buffer */
-	stash->buf.skb = tx_buf->skb;
-	stash->buf.bytes = tx_buf->bytes;
-	stash->buf.packets = tx_buf->packets;
-	stash->buf.type = tx_buf->type;
-	stash->buf.nr_frags = tx_buf->nr_frags;
-	dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma));
-	dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len));
-	idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf);
-
-	/* Add buffer to buf_hash table to be freed later */
-	hash_add(txq->stash->sched_buf_hash, &stash->hlist,
-		 idpf_tx_buf_compl_tag(&stash->buf));
-
-	tx_buf->type = LIBETH_SQE_EMPTY;
-
-	return 0;
-}
-
 #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf)	\
 do {								\
 	if (unlikely(++(ntc) == (txq)->desc_count)) {		\
@@ -1800,14 +1584,8 @@ do { \
  * Separate packet completion events will be reported on the completion queue,
  * and the buffers will be cleaned separately. The stats are not updated from
  * this function when using flow-based scheduling.
- *
- * Furthermore, in flow scheduling mode, check to make sure there are enough
- * reserve buffers to stash the packet. If there are not, return early, which
- * will leave next_to_clean pointing to the packet that failed to be stashed.
- *
- * Return: false in the scenario above, true otherwise.
  */
-static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
+static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
 				 int napi_budget,
 				 struct libeth_sq_napi_stats *cleaned,
 				 bool descs_only)
@@ -1821,12 +1599,11 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
 		.napi	= napi_budget,
 	};
 	struct idpf_tx_buf *tx_buf;
-	bool clean_complete = true;
 
 	if (descs_only) {
 		/* Bump ring index to mark as cleaned. */
 		tx_q->next_to_clean = end;
-		return true;
+		return;
 	}
 
 	tx_desc = &tx_q->flex_tx[ntc];
@@ -1847,53 +1624,24 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
 			break;
 
 		eop_idx = tx_buf->rs_idx;
+		libeth_tx_complete(tx_buf, &cp);
 
-		if (descs_only) {
-			if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) {
-				clean_complete = false;
-				goto tx_splitq_clean_out;
-			}
-
-			idpf_stash_flow_sch_buffers(tx_q, tx_buf);
+		/* unmap remaining buffers */
+		while (ntc != eop_idx) {
+			idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
+						      tx_desc, tx_buf);
 
-			while (ntc != eop_idx) {
-				idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
-							      tx_desc, tx_buf);
-				idpf_stash_flow_sch_buffers(tx_q, tx_buf);
-			}
-		} else {
+			/* unmap any remaining paged data */
 			libeth_tx_complete(tx_buf, &cp);
-
-			/* unmap remaining buffers */
-			while (ntc != eop_idx) {
-				idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
-							      tx_desc, tx_buf);
-
-				/* unmap any remaining paged data */
-				libeth_tx_complete(tx_buf, &cp);
-			}
 		}
 
 fetch_next_txq_desc:
 		idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf);
 	}
 
-tx_splitq_clean_out:
 	tx_q->next_to_clean = ntc;
-
-	return clean_complete;
 }
 
-#define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf)	\
-do {							\
-	(buf)++;					\
-	(ntc)++;					\
-	if (unlikely((ntc) == (txq)->desc_count)) {	\
-		buf = (txq)->tx_buf;			\
-		ntc = 0;				\
-	}						\
-} while (0)
-
 /**
  * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers
  * @txq: queue to clean
@@ -1904,7 +1652,7 @@ do { \
 * Clean all buffers associated with the packet starting at buf_id. Returns the
 * byte/segment count for the cleaned packet.
 */
-static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id,
+static void idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id,
			       struct libeth_sq_napi_stats *cleaned,
			       int budget)
 {
@@ -1928,8 +1676,6 @@ static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id,
 	libeth_tx_complete(tx_buf, &cp);
 	idpf_post_buf_refill(txq->refillq, buf_id);
 }
-
-	return true;
 }
 
 /**
@@ -1948,22 +1694,17 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq,
 					 struct libeth_sq_napi_stats *cleaned,
 					 int budget)
 {
-	u16 compl_tag;
+	/* RS completion contains queue head for queue based scheduling or
+	 * completion tag for flow based scheduling.
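+	 * Both values are carried in the same 16-bit field of the
+	 * completion descriptor, so a single read serves either mode.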
+	 */
+	u16 rs_compl_val = le16_to_cpu(desc->q_head_compl_tag.q_head);
 
 	if (!idpf_queue_has(FLOW_SCH_EN, txq)) {
-		u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head);
-
-		idpf_tx_splitq_clean(txq, head, budget, cleaned, false);
+		idpf_tx_splitq_clean(txq, rs_compl_val, budget, cleaned, false);
 		return;
 	}
 
-	compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag);
-
-	/* If we didn't clean anything on the ring, this packet must be
-	 * in the hash table. Go clean it there.
-	 */
-	if (!idpf_tx_clean_bufs(txq, compl_tag, cleaned, budget))
-		idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget);
+	idpf_tx_clean_bufs(txq, rs_compl_val, cleaned, budget);
 }
 
 /**
@@ -2080,8 +1821,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget,
 		/* Update BQL */
 		nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
 
-		dont_wake = !complq_ok || IDPF_TX_BUF_RSV_LOW(tx_q) ||
-			    np->state != __IDPF_VPORT_UP ||
+		dont_wake = !complq_ok || np->state != __IDPF_VPORT_UP ||
 			    !netif_carrier_ok(tx_q->netdev);
 		/* Check if the TXQ needs to and can be restarted */
 		__netif_txq_completed_wake(nq, tx_q->cleaned_pkts, tx_q->cleaned_bytes,
@@ -2152,7 +1892,6 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed,
 	if (IDPF_DESC_UNUSED(tx_q) < descs_needed ||
 	    IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) >
 	    IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) ||
-	    IDPF_TX_BUF_RSV_LOW(tx_q) ||
 	    idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed)
 		return 0;
 	return 1;
@@ -2276,10 +2015,8 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu)
 {
 	ntu++;
 
-	if (ntu == txq->desc_count) {
+	if (ntu == txq->desc_count)
 		ntu = 0;
-		txq->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(txq);
-	}
 
 	return ntu;
 }
@@ -2461,8 +2198,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
 			if (unlikely(++i == tx_q->desc_count)) {
 				tx_desc = &tx_q->flex_tx[0];
 				i = 0;
-				tx_q->compl_tag_cur_gen =
-					IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q);
 			} else {
 				tx_desc++;
 			}
@@ -2493,7 +2228,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
 		if (unlikely(++i == tx_q->desc_count)) {
 			tx_desc = &tx_q->flex_tx[0];
 			i = 0;
-			tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q);
 		} else {
 			tx_desc++;
 		}
@@ -2854,10 +2588,9 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
 		tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE;
 		tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP;
 
-		/* Set the RE bit to catch any packets that may have not been
-		 * stashed during RS completion cleaning. MIN_GAP is set to
-		 * MIN_RING size to ensure it will be set at least once each
-		 * time around the ring.
+		/* Set the RE bit to periodically "clean" the descriptor ring.
+		 * MIN_GAP is set to MIN_RING size to ensure it will be set at
+		 * least once each time around the ring.
 		 */
 		if (idpf_tx_splitq_need_re(tx_q)) {
 			tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE;
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index cfa6aa5b3bb8f..1011e25e9f925 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -123,10 +123,6 @@ do { \
 	((((txq)->next_to_clean > (txq)->next_to_use) ?
0 : (txq)->desc_count) + \ (txq)->next_to_clean - (txq)->next_to_use - 1) -#define IDPF_TX_BUF_RSV_UNUSED(txq) ((txq)->stash->buf_stack.top) -#define IDPF_TX_BUF_RSV_LOW(txq) (IDPF_TX_BUF_RSV_UNUSED(txq) < \ - (txq)->desc_count >> 2) - #define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1) /* Determine the absolute number of completions pending, i.e. the number of * completions that are expected to arrive on the TX completion queue. @@ -136,12 +132,6 @@ do { \ 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) -#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -/* Adjust the generation for the completion tag and wrap if necessary */ -#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ - ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ - 0 : (txq)->compl_tag_cur_gen) - #define IDPF_TXBUF_NULL U32_MAX #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) @@ -158,18 +148,6 @@ union idpf_tx_flex_desc { #define idpf_tx_buf libeth_sqe -/** - * struct idpf_buf_lifo - LIFO for managing OOO completions - * @top: Used to know how many buffers are left - * @size: Total size of LIFO - * @bufs: Backing array - */ -struct idpf_buf_lifo { - u16 top; - u16 size; - struct idpf_tx_stash **bufs; -}; - /** * struct idpf_tx_offload_params - Offload parameters for a given packet * @tx_flags: Feature flags enabled for this packet @@ -573,17 +551,6 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) #define IDPF_DIM_DEFAULT_PROFILE_IX 1 -/** - * struct idpf_txq_stash - Tx buffer stash for Flow-based scheduling mode - * @buf_stack: Stack of empty buffers to store buffer info for out of order - * buffer completions. See struct idpf_buf_lifo - * @sched_buf_hash: Hash table to store buffers - */ -struct idpf_txq_stash { - struct idpf_buf_lifo buf_stack; - DECLARE_HASHTABLE(sched_buf_hash, 12); -} ____cacheline_aligned; - /** * struct idpf_rx_queue - software structure representing a receive queue * @rx: universal receive descriptor array @@ -696,7 +663,6 @@ struct idpf_rx_queue { * @cleaned_pkts: Number of packets cleaned for the above said case * @tx_min_pkt_len: Min supported packet length * @refillq: Pointer to refill queue - * @compl_tag_bufid_m: Completion tag buffer id mask * @compl_tag_gen_s: Completion tag generation bit * The format of the completion tag will change based on the TXQ * descriptor ring size so that we can maintain roughly the same level @@ -717,9 +683,6 @@ struct idpf_rx_queue { * -------------------------------- * * This gives us 8*8160 = 65280 possible unique values. 
- * @compl_tag_cur_gen: Used to keep track of current completion tag generation
- * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset
- * @stash: Tx buffer stash for Flow-based scheduling mode
 * @stats_sync: See struct u64_stats_sync
 * @q_stats: See union idpf_tx_queue_stats
 * @q_id: Queue id
@@ -761,14 +724,6 @@ struct idpf_tx_queue {
 	u16 tx_min_pkt_len;
 
 	struct idpf_sw_queue *refillq;
-	u16 compl_tag_bufid_m;
-	u16 compl_tag_gen_s;
-
-	u16 compl_tag_cur_gen;
-	u16 compl_tag_gen_max;
-
-	struct idpf_txq_stash *stash;
-
 	struct u64_stats_sync stats_sync;
 	struct idpf_tx_queue_stats q_stats;
@@ -964,7 +919,6 @@ struct idpf_rxq_group {
 * @vport: Vport back pointer
 * @num_txq: Number of TX queues associated
 * @txqs: Array of TX queue pointers
- * @stashes: array of OOO stashes for the queues
 * @complq: Associated completion queue pointer, split queue only
 * @num_completions_pending: Total number of completions pending for the
 *			    completion queue, acculumated for all TX queues
@@ -979,7 +933,6 @@ struct idpf_txq_group {
 	u16 num_txq;
 	struct idpf_tx_queue *txqs[IDPF_LARGE_MAX_Q];
-	struct idpf_txq_stash *stashes;
 
 	struct idpf_compl_queue *complq;
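
For reference, idpf_tx_splitq_get_free_bufs() and the IDPF_DESC_UNUSED() macro above compute the same quantity: the distance from one ring index to the other, minus one slot that is always kept unused so that equal indices are unambiguous. A minimal standalone sketch of that arithmetic follows; it is plain C, and the struct and names are illustrative, not part of the driver:

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in for the driver's ring bookkeeping fields. */
struct ring {
	unsigned int next_to_use;	/* producer index */
	unsigned int next_to_clean;	/* consumer index */
	unsigned int desc_count;	/* ring size */
};

/*
 * Entries between @from and @to, modulo the ring size, minus the one
 * reserved slot. The conditional adds the ring size back in before
 * subtracting, so the expression never goes negative in unsigned
 * arithmetic.
 */
static unsigned int ring_avail(const struct ring *r, unsigned int to,
			       unsigned int from)
{
	return (to > from ? 0 : r->desc_count) + to - from - 1;
}

int main(void)
{
	struct ring r = { .next_to_use = 5, .next_to_clean = 2,
			  .desc_count = 8 };

	/* IDPF_DESC_UNUSED() shape: slots the producer may still fill,
	 * here 8 + 2 - 5 - 1 = 4.
	 */
	assert(ring_avail(&r, r.next_to_clean, r.next_to_use) == 4);

	/* idpf_tx_splitq_get_free_bufs() shape: entries the consumer may
	 * still take, here 5 - 2 - 1 = 2.
	 */
	assert(ring_avail(&r, r.next_to_use, r.next_to_clean) == 2);

	/* Equal indices resolve to "all but the reserved slot". */
	assert(ring_avail(&r, 3, 3) == 7);

	printf("ring arithmetic checks pass\n");

	return 0;
}

The two call sites pass the indices in opposite order because they ask opposite questions: the descriptor ring wants the room left for the producer, while the refillq wants the buf_ids still available to the transmit path.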