From b6cf8b3f3312e2ffe38f23c7c692eb9389277a9a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 9 Jul 2020 15:29:42 +0206 Subject: printk: add lockless ringbuffer Introduce a multi-reader multi-writer lockless ringbuffer for storing the kernel log messages. Readers and writers may use their API from any context (including scheduler and NMI). This ringbuffer will make it possible to decouple printk() callers from any context, locking, or console constraints. It also makes it possible for readers to have full access to the ringbuffer contents at any time and context (for example from any panic situation). The printk_ringbuffer is made up of 3 internal ringbuffers: desc_ring: A ring of descriptors. A descriptor contains all record meta data (sequence number, timestamp, loglevel, etc.) as well as internal state information about the record and logical positions specifying where in the other ringbuffers the text and dictionary strings are located. text_data_ring: A ring of data blocks. A data block consists of an unsigned long integer (ID) that maps to a desc_ring index followed by the text string of the record. dict_data_ring: A ring of data blocks. A data block consists of an unsigned long integer (ID) that maps to a desc_ring index followed by the dictionary string of the record. The internal state information of a descriptor is the key element to allow readers and writers to locklessly synchronize access to the data. Co-developed-by: Petr Mladek Signed-off-by: John Ogness Reviewed-by: Petr Mladek Reviewed-by: Paul E. McKenney Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200709132344.760-3-john.ogness@linutronix.de --- kernel/printk/Makefile | 1 + kernel/printk/printk_ringbuffer.c | 1687 +++++++++++++++++++++++++++++++++++++ kernel/printk/printk_ringbuffer.h | 399 +++++++++ 3 files changed, 2087 insertions(+) create mode 100644 kernel/printk/printk_ringbuffer.c create mode 100644 kernel/printk/printk_ringbuffer.h (limited to 'kernel/printk') diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..eee3dc9b60a9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,3 +2,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o +obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c new file mode 100644 index 000000000000..7355ca99e852 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.c @@ -0,0 +1,1687 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include "printk_ringbuffer.h" + +/** + * DOC: printk_ringbuffer overview + * + * Data Structure + * -------------- + * The printk_ringbuffer is made up of 3 internal ringbuffers: + * + * desc_ring + * A ring of descriptors. A descriptor contains all record meta data + * (sequence number, timestamp, loglevel, etc.) as well as internal state + * information about the record and logical positions specifying where in + * the other ringbuffers the text and dictionary strings are located. + * + * text_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the text + * string of the record. + * + * dict_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the dictionary + * string of the record. + * + * The internal state information of a descriptor is the key element to allow + * readers and writers to locklessly synchronize access to the data. + * + * Implementation + * -------------- + * + * Descriptor Ring + * ~~~~~~~~~~~~~~~ + * The descriptor ring is an array of descriptors. A descriptor contains all + * the meta data of a printk record as well as blk_lpos structs pointing to + * associated text and dictionary data blocks (see "Data Rings" below). Each + * descriptor is assigned an ID that maps directly to index values of the + * descriptor array and has a state. The ID and the state are bitwise combined + * into a single descriptor field named @state_var, allowing ID and state to + * be synchronously and atomically updated. + * + * Descriptors have three states: + * + * reserved + * A writer is modifying the record. + * + * committed + * The record and all its data are complete and available for reading. + * + * reusable + * The record exists, but its text and/or dictionary data may no longer + * be available. + * + * Querying the @state_var of a record requires providing the ID of the + * descriptor to query. This can yield a possible fourth (pseudo) state: + * + * miss + * The descriptor being queried has an unexpected ID. + * + * The descriptor ring has a @tail_id that contains the ID of the oldest + * descriptor and @head_id that contains the ID of the newest descriptor. + * + * When a new descriptor should be created (and the ring is full), the tail + * descriptor is invalidated by first transitioning to the reusable state and + * then invalidating all tail data blocks up to and including the data blocks + * associated with the tail descriptor (for text and dictionary rings). Then + * @tail_id is advanced, followed by advancing @head_id. And finally the + * @state_var of the new descriptor is initialized to the new ID and reserved + * state. + * + * The @tail_id can only be advanced if the new @tail_id would be in the + * committed or reusable queried state. This makes it possible that a valid + * sequence number of the tail is always available. + * + * Data Rings + * ~~~~~~~~~~ + * The two data rings (text and dictionary) function identically. They exist + * separately so that their buffer sizes can be individually set and they do + * not affect one another. + * + * Data rings are byte arrays composed of data blocks. Data blocks are + * referenced by blk_lpos structs that point to the logical position of the + * beginning of a data block and the beginning of the next adjacent data + * block. Logical positions are mapped directly to index values of the byte + * array ringbuffer. + * + * Each data block consists of an ID followed by the writer data. The ID is + * the identifier of a descriptor that is associated with the data block. A + * given data block is considered valid if all of the following conditions + * are met: + * + * 1) The descriptor associated with the data block is in the committed + * queried state. + * + * 2) The blk_lpos struct within the descriptor associated with the data + * block references back to the same data block. + * + * 3) The data block is within the head/tail logical position range. + * + * If the writer data of a data block would extend beyond the end of the + * byte array, only the ID of the data block is stored at the logical + * position and the full data block (ID and writer data) is stored at the + * beginning of the byte array. The referencing blk_lpos will point to the + * ID before the wrap and the next data block will be at the logical + * position adjacent the full data block after the wrap. + * + * Data rings have a @tail_lpos that points to the beginning of the oldest + * data block and a @head_lpos that points to the logical position of the + * next (not yet existing) data block. + * + * When a new data block should be created (and the ring is full), tail data + * blocks will first be invalidated by putting their associated descriptors + * into the reusable state and then pushing the @tail_lpos forward beyond + * them. Then the @head_lpos is pushed forward and is associated with a new + * descriptor. If a data block is not valid, the @tail_lpos cannot be + * advanced beyond it. + * + * Usage + * ----- + * Here are some simple examples demonstrating writers and readers. For the + * examples a global ringbuffer (test_rb) is available (which is not the + * actual ringbuffer used by printk):: + * + * DEFINE_PRINTKRB(test_rb, 15, 5, 3); + * + * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of + * 1 MiB (2 ^ (15 + 5)) for text data and 256 KiB (2 ^ (15 + 3)) for + * dictionary data. + * + * Sample writer code:: + * + * const char *dictstr = "dictionary text"; + * const char *textstr = "message text"; + * struct prb_reserved_entry e; + * struct printk_record r; + * + * // specify how much to allocate + * prb_rec_init_wr(&r, strlen(textstr) + 1, strlen(dictstr) + 1); + * + * if (prb_reserve(&e, &test_rb, &r)) { + * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); + * + * // dictionary allocation may have failed + * if (r.dict_buf) + * snprintf(r.dict_buf, r.dict_buf_size, "%s", dictstr); + * + * r.info->ts_nsec = local_clock(); + * + * prb_commit(&e); + * } + * + * Sample reader code:: + * + * struct printk_info info; + * struct printk_record r; + * char text_buf[32]; + * char dict_buf[32]; + * u64 seq; + * + * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf), + * &dict_buf[0], sizeof(dict_buf)); + * + * prb_for_each_record(0, &test_rb, &seq, &r) { + * if (info.seq != seq) + * pr_warn("lost %llu records\n", info.seq - seq); + * + * if (info.text_len > r.text_buf_size) { + * pr_warn("record %llu text truncated\n", info.seq); + * text_buf[r.text_buf_size - 1] = 0; + * } + * + * if (info.dict_len > r.dict_buf_size) { + * pr_warn("record %llu dict truncated\n", info.seq); + * dict_buf[r.dict_buf_size - 1] = 0; + * } + * + * pr_info("%llu: %llu: %s;%s\n", info.seq, info.ts_nsec, + * &text_buf[0], info.dict_len ? &dict_buf[0] : ""); + * } + * + * Note that additional less convenient reader functions are available to + * allow complex record access. + * + * ABA Issues + * ~~~~~~~~~~ + * To help avoid ABA issues, descriptors are referenced by IDs (array index + * values combined with tagged bits counting array wraps) and data blocks are + * referenced by logical positions (array index values combined with tagged + * bits counting array wraps). However, on 32-bit systems the number of + * tagged bits is relatively small such that an ABA incident is (at least + * theoretically) possible. For example, if 4 million maximally sized (1KiB) + * printk messages were to occur in NMI context on a 32-bit system, the + * interrupted context would not be able to recognize that the 32-bit integer + * completely wrapped and thus represents a different data block than the one + * the interrupted context expects. + * + * To help combat this possibility, additional state checking is performed + * (such as using cmpxchg() even though set() would suffice). These extra + * checks are commented as such and will hopefully catch any ABA issue that + * a 32-bit system might experience. + * + * Memory Barriers + * ~~~~~~~~~~~~~~~ + * Multiple memory barriers are used. To simplify proving correctness and + * generating litmus tests, lines of code related to memory barriers + * (loads, stores, and the associated memory barriers) are labeled:: + * + * LMM(function:letter) + * + * Comments reference the labels using only the "function:letter" part. + * + * The memory barrier pairs and their ordering are: + * + * desc_reserve:D / desc_reserve:B + * push descriptor tail (id), then push descriptor head (id) + * + * desc_reserve:D / data_push_tail:B + * push data tail (lpos), then set new descriptor reserved (state) + * + * desc_reserve:D / desc_push_tail:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:D / prb_first_seq:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:F / desc_read:D + * set new descriptor id and reserved (state), then allow writer changes + * + * data_alloc:A / desc_read:D + * set old descriptor reusable (state), then modify new data block area + * + * data_alloc:A / data_push_tail:B + * push data tail (lpos), then modify new data block area + * + * prb_commit:B / desc_read:B + * store writer changes, then set new descriptor committed (state) + * + * data_push_tail:D / data_push_tail:A + * set descriptor reusable (state), then push data tail (lpos) + * + * desc_push_tail:B / desc_reserve:D + * set descriptor reusable (state), then push descriptor tail (id) + */ + +#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) +#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) + +#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) +#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) + +/* Determine the data array index from a logical position. */ +#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) + +/* Determine the desc array index from an ID or sequence number. */ +#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) + +/* Determine how many times the data array has wrapped. */ +#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) + +/* Get the logical position at index 0 of the current wrap. */ +#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ +((lpos) & ~DATA_SIZE_MASK(data_ring)) + +/* Get the ID for the same index of the previous wrap as the given ID. */ +#define DESC_ID_PREV_WRAP(desc_ring, id) \ +DESC_ID((id) - DESCS_COUNT(desc_ring)) + +/* + * A data block: mapped directly to the beginning of the data block area + * specified as a logical position within the data ring. + * + * @id: the ID of the associated descriptor + * @data: the writer data + * + * Note that the size of a data block is only known by its associated + * descriptor. + */ +struct prb_data_block { + unsigned long id; + char data[0]; +}; + +/* + * Return the descriptor associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; +} + +static struct prb_data_block *to_block(struct prb_data_ring *data_ring, + unsigned long begin_lpos) +{ + return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; +} + +/* + * Increase the data size to account for data block meta data plus any + * padding so that the adjacent data block is aligned on the ID size. + */ +static unsigned int to_blk_size(unsigned int size) +{ + struct prb_data_block *db = NULL; + + size += sizeof(*db); + size = ALIGN(size, sizeof(db->id)); + return size; +} + +/* + * Sanity checker for reserve size. The ringbuffer code assumes that a data + * block does not exceed the maximum possible size that could fit within the + * ringbuffer. This function provides that basic size check so that the + * assumption is safe. + * + * Writers are also not allowed to write 0-sized (data-less) records. Such + * records are used only internally by the ringbuffer. + */ +static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) +{ + struct prb_data_block *db = NULL; + + /* + * Writers are not allowed to write data-less records. Such records + * are used only internally by the ringbuffer to denote records where + * their data failed to allocate or have been lost. + */ + if (size == 0) + return false; + + /* + * Ensure the alignment padded size could possibly fit in the data + * array. The largest possible data block must still leave room for + * at least the ID of the next block. + */ + size = to_blk_size(size); + if (size > DATA_SIZE(data_ring) - sizeof(db->id)) + return false; + + return true; +} + +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss, /* ID mismatch */ + desc_reserved, /* reserved, in use by writer */ + desc_committed, /* committed, writer is done */ + desc_reusable, /* free, not yet used by any writer */ +}; + +/* Query the state of a descriptor. */ +static enum desc_state get_desc_state(unsigned long id, + unsigned long state_val) +{ + if (id != DESC_ID(state_val)) + return desc_miss; + + if (state_val & DESC_REUSE_MASK) + return desc_reusable; + + if (state_val & DESC_COMMITTED_MASK) + return desc_committed; + + return desc_reserved; +} + +/* + * Get a copy of a specified descriptor and its queried state. A descriptor + * that is not in the committed or reusable state must be considered garbage + * by the reader. + */ +static enum desc_state desc_read(struct prb_desc_ring *desc_ring, + unsigned long id, struct prb_desc *desc_out) +{ + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + enum desc_state d_state; + unsigned long state_val; + + /* Check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ + d_state = get_desc_state(id, state_val); + if (d_state != desc_committed && d_state != desc_reusable) + return d_state; + + /* + * Guarantee the state is loaded before copying the descriptor + * content. This avoids copying obsolete descriptor content that might + * not apply to the descriptor state. This pairs with prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_read:A reads from prb_commit:B, then desc_read:C reads + * from prb_commit:A. + * + * Relies on: + * + * WMB from prb_commit:A to prb_commit:B + * matching + * RMB from desc_read:A to desc_read:C + */ + smp_rmb(); /* LMM(desc_read:B) */ + + /* + * Copy the descriptor data. The data is not valid until the + * state has been re-checked. + */ + memcpy(desc_out, desc, sizeof(*desc_out)); /* LMM(desc_read:C) */ + + /* + * 1. Guarantee the descriptor content is loaded before re-checking + * the state. This avoids reading an obsolete descriptor state + * that may not apply to the copied content. This pairs with + * desc_reserve:F. + * + * Memory barrier involvement: + * + * If desc_read:C reads from desc_reserve:G, then desc_read:E + * reads from desc_reserve:F. + * + * Relies on: + * + * WMB from desc_reserve:F to desc_reserve:G + * matching + * RMB from desc_read:C to desc_read:E + * + * 2. Guarantee the record data is loaded before re-checking the + * state. This avoids reading an obsolete descriptor state that may + * not apply to the copied data. This pairs with data_alloc:A. + * + * Memory barrier involvement: + * + * If copy_data:A reads from data_alloc:B, then desc_read:E + * reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_alloc:B + * matching + * RMB from desc_read:C to desc_read:E + * + * Note: desc_make_reusable:A and data_alloc:B can be different + * CPUs. However, the data_alloc:B CPU (which performs the + * full memory barrier) must have previously seen + * desc_make_reusable:A. + */ + smp_rmb(); /* LMM(desc_read:D) */ + + /* Re-check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ + return get_desc_state(id, state_val); +} + +/* + * Take a specified descriptor out of the committed state by attempting + * the transition from committed to reusable. Either this context or some + * other context will have been successful. + */ +static void desc_make_reusable(struct prb_desc_ring *desc_ring, + unsigned long id) +{ + unsigned long val_committed = id | DESC_COMMITTED_MASK; + unsigned long val_reusable = val_committed | DESC_REUSE_MASK; + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + + atomic_long_cmpxchg_relaxed(state_var, val_committed, + val_reusable); /* LMM(desc_make_reusable:A) */ +} + +/* + * Given a data ring (text or dict), put the associated descriptor of each + * data block from @lpos_begin until @lpos_end into the reusable state. + * + * If there is any problem making the associated descriptor reusable, either + * the descriptor has not yet been committed or another writer context has + * already pushed the tail lpos past the problematic data block. Regardless, + * on error the caller can re-load the tail lpos to determine the situation. + */ +static bool data_make_reusable(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos_begin, + unsigned long lpos_end, + unsigned long *lpos_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_data_blk_lpos *blk_lpos; + struct prb_data_block *blk; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + + /* + * Using the provided @data_ring, point @blk_lpos to the correct + * blk_lpos within the local copy of the descriptor. + */ + if (data_ring == &rb->text_data_ring) + blk_lpos = &desc.text_blk_lpos; + else + blk_lpos = &desc.dict_blk_lpos; + + /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ + while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, lpos_begin); + + /* + * Load the block ID from the data block. This is a data race + * against a writer that may have newly reserved this data + * area. If the loaded value matches a valid descriptor ID, + * the blk_lpos of that descriptor will be checked to make + * sure it points back to this data block. If the check fails, + * the data area has been recycled by another writer. + */ + id = blk->id; /* LMM(data_make_reusable:A) */ + + d_state = desc_read(desc_ring, id, &desc); /* LMM(data_make_reusable:B) */ + + switch (d_state) { + case desc_miss: + return false; + case desc_reserved: + return false; + case desc_committed: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + desc_make_reusable(desc_ring, id); + break; + case desc_reusable: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + break; + } + + /* Advance @lpos_begin to the next data block. */ + lpos_begin = blk_lpos->next; + } + + *lpos_out = lpos_begin; + return true; +} + +/* + * Advance the data ring tail to at least @lpos. This function puts + * descriptors into the reusable state if the tail is pushed beyond + * their associated data block. + */ +static bool data_push_tail(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos) +{ + unsigned long tail_lpos_new; + unsigned long tail_lpos; + unsigned long next_lpos; + + /* If @lpos is not valid, there is nothing to do. */ + if (lpos == INVALID_LPOS) + return true; + + /* + * Any descriptor states that have transitioned to reusable due to the + * data tail being pushed to this loaded value will be visible to this + * CPU. This pairs with data_push_tail:D. + * + * Memory barrier involvement: + * + * If data_push_tail:A reads from data_push_tail:D, then this CPU can + * see desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_push_tail:D + * matches + * READFROM from data_push_tail:D to data_push_tail:A + * thus + * READFROM from desc_make_reusable:A to this CPU + */ + tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ + + /* + * Loop until the tail lpos is at or beyond @lpos. This condition + * may already be satisfied, resulting in no full memory barrier + * from data_push_tail:D being performed. However, since this CPU + * sees the new tail lpos, any descriptor states that transitioned to + * the reusable state must already be visible. + */ + while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + /* + * Make all descriptors reusable that are associated with + * data blocks before @lpos. + */ + if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, + &next_lpos)) { + /* + * 1. Guarantee the block ID loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled data area causing the tail lpos to + * have been previously pushed. This pairs with + * data_alloc:A. + * + * Memory barrier involvement: + * + * If data_make_reusable:A reads from data_alloc:B, + * then data_push_tail:C reads from + * data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to data_alloc:B + * matching + * RMB from data_make_reusable:A to + * data_push_tail:C + * + * Note: data_push_tail:D and data_alloc:B can be + * different CPUs. However, the data_alloc:B + * CPU (which performs the full memory + * barrier) must have previously seen + * data_push_tail:D. + * + * 2. Guarantee the descriptor state loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled descriptor causing the tail lpos to + * have been previously pushed. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If data_make_reusable:B reads from + * desc_reserve:F, then data_push_tail:C reads + * from data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to desc_reserve:F + * matching + * RMB from data_make_reusable:B to + * data_push_tail:C + * + * Note: data_push_tail:D and desc_reserve:F can + * be different CPUs. However, the + * desc_reserve:F CPU (which performs the + * full memory barrier) must have previously + * seen data_push_tail:D. + */ + smp_rmb(); /* LMM(data_push_tail:B) */ + + tail_lpos_new = atomic_long_read(&data_ring->tail_lpos + ); /* LMM(data_push_tail:C) */ + if (tail_lpos_new == tail_lpos) + return false; + + /* Another CPU pushed the tail. Try again. */ + tail_lpos = tail_lpos_new; + continue; + } + + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail lpos. A full + * memory barrier is needed since other CPUs may have made + * the descriptor states reusable. This pairs with + * data_push_tail:A. + */ + if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, + next_lpos)) { /* LMM(data_push_tail:D) */ + break; + } + } + + return true; +} + +/* + * Advance the desc ring tail. This function advances the tail by one + * descriptor, thus invalidating the oldest descriptor. Before advancing + * the tail, the tail descriptor is made reusable and all data blocks up to + * and including the descriptor's data block are invalidated (i.e. the data + * ring tail is pushed past the data block of the descriptor being made + * reusable). + */ +static bool desc_push_tail(struct printk_ringbuffer *rb, + unsigned long tail_id) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + + d_state = desc_read(desc_ring, tail_id, &desc); + + switch (d_state) { + case desc_miss: + /* + * If the ID is exactly 1 wrap behind the expected, it is + * in the process of being reserved by another writer and + * must be considered reserved. + */ + if (DESC_ID(atomic_long_read(&desc.state_var)) == + DESC_ID_PREV_WRAP(desc_ring, tail_id)) { + return false; + } + + /* + * The ID has changed. Another writer must have pushed the + * tail and recycled the descriptor already. Success is + * returned because the caller is only interested in the + * specified tail being pushed, which it was. + */ + return true; + case desc_reserved: + return false; + case desc_committed: + desc_make_reusable(desc_ring, tail_id); + break; + case desc_reusable: + break; + } + + /* + * Data blocks must be invalidated before their associated + * descriptor can be made available for recycling. Invalidating + * them later is not possible because there is no way to trust + * data blocks once their associated descriptor is gone. + */ + + if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) + return false; + if (!data_push_tail(rb, &rb->dict_data_ring, desc.dict_blk_lpos.next)) + return false; + + /* + * Check the next descriptor after @tail_id before pushing the tail + * to it because the tail must always be in a committed or reusable + * state. The implementation of prb_first_seq() relies on this. + * + * A successful read implies that the next descriptor is less than or + * equal to @head_id so there is no risk of pushing the tail past the + * head. + */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc); /* LMM(desc_push_tail:A) */ + + if (d_state == desc_committed || d_state == desc_reusable) { + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail ID. This allows + * verifying the recycled descriptor state. A full memory + * barrier is needed since other CPUs may have made the + * descriptor states reusable. This pairs with desc_reserve:D. + */ + atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, + DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ + } else { + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail ID in the + * case that the descriptor has been recycled. This pairs + * with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_push_tail:A reads from desc_reserve:F, then + * desc_push_tail:D reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB from desc_push_tail:A to desc_push_tail:D + * + * Note: desc_push_tail:B and desc_reserve:F can be different + * CPUs. However, the desc_reserve:F CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_push_tail:C) */ + + /* + * Re-check the tail ID. The descriptor following @tail_id is + * not in an allowed tail state. But if the tail has since + * been moved by another CPU, then it does not matter. + */ + if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ + return false; + } + + return true; +} + +/* Reserve a new descriptor, invalidating the oldest if necessary. */ +static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long prev_state_val; + unsigned long id_prev_wrap; + struct prb_desc *desc; + unsigned long head_id; + unsigned long id; + + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ + + do { + desc = to_desc(desc_ring, head_id); + + id = DESC_ID(head_id + 1); + id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); + + /* + * Guarantee the head ID is read before reading the tail ID. + * Since the tail ID is updated before the head ID, this + * guarantees that @id_prev_wrap is never ahead of the tail + * ID. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_reserve:A reads from desc_reserve:D, then + * desc_reserve:C reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:D + * matching + * RMB from desc_reserve:A to desc_reserve:C + * + * Note: desc_push_tail:B and desc_reserve:D can be different + * CPUs. However, the desc_reserve:D CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_reserve:B) */ + + if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id + )) { /* LMM(desc_reserve:C) */ + /* + * Make space for the new descriptor by + * advancing the tail. + */ + if (!desc_push_tail(rb, id_prev_wrap)) + return false; + } + + /* + * 1. Guarantee the tail ID is read before validating the + * recycled descriptor state. A read memory barrier is + * sufficient for this. This pairs with desc_push_tail:B. + * + * Memory barrier involvement: + * + * If desc_reserve:C reads from desc_push_tail:B, then + * desc_reserve:E reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to desc_push_tail:B + * matching + * RMB from desc_reserve:C to desc_reserve:E + * + * Note: desc_make_reusable:A and desc_push_tail:B can be + * different CPUs. However, the desc_push_tail:B CPU + * (which performs the full memory barrier) must have + * previously seen desc_make_reusable:A. + * + * 2. Guarantee the tail ID is stored before storing the head + * ID. This pairs with desc_reserve:B. + * + * 3. Guarantee any data ring tail changes are stored before + * recycling the descriptor. Data ring tail changes can + * happen via desc_push_tail()->data_push_tail(). A full + * memory barrier is needed since another CPU may have + * pushed the data ring tails. This pairs with + * data_push_tail:B. + * + * 4. Guarantee a new tail ID is stored before recycling the + * descriptor. A full memory barrier is needed since + * another CPU may have pushed the tail ID. This pairs + * with desc_push_tail:C and this also pairs with + * prb_first_seq:C. + */ + } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, + id)); /* LMM(desc_reserve:D) */ + + desc = to_desc(desc_ring, id); + + /* + * If the descriptor has been recycled, verify the old state val. + * See "ABA Issues" about why this verification is performed. + */ + prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ + if (prev_state_val && + prev_state_val != (id_prev_wrap | DESC_COMMITTED_MASK | DESC_REUSE_MASK)) { + WARN_ON_ONCE(1); + return false; + } + + /* + * Assign the descriptor a new ID and set its state to reserved. + * See "ABA Issues" about why cmpxchg() instead of set() is used. + * + * Guarantee the new descriptor ID and state is stored before making + * any other changes. A write memory barrier is sufficient for this. + * This pairs with desc_read:D. + */ + if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, + id | 0)) { /* LMM(desc_reserve:F) */ + WARN_ON_ONCE(1); + return false; + } + + /* Now data in @desc can be modified: LMM(desc_reserve:G) */ + + *id_out = id; + return true; +} + +/* Determine the end of a data block. */ +static unsigned long get_next_lpos(struct prb_data_ring *data_ring, + unsigned long lpos, unsigned int size) +{ + unsigned long begin_lpos; + unsigned long next_lpos; + + begin_lpos = lpos; + next_lpos = lpos + size; + + /* First check if the data block does not wrap. */ + if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + return next_lpos; + + /* Wrapping data blocks store their data at the beginning. */ + return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); +} + +/* + * Allocate a new data block, invalidating the oldest data block(s) + * if necessary. This function also associates the data block with + * a specified descriptor. + */ +static char *data_alloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long begin_lpos; + unsigned long next_lpos; + + if (size == 0) { + /* Specify a data-less block. */ + blk_lpos->begin = INVALID_LPOS; + blk_lpos->next = INVALID_LPOS; + return NULL; + } + + size = to_blk_size(size); + + begin_lpos = atomic_long_read(&data_ring->head_lpos); + + do { + next_lpos = get_next_lpos(data_ring, begin_lpos, size); + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { + /* Failed to allocate, specify a data-less block. */ + blk_lpos->begin = INVALID_LPOS; + blk_lpos->next = INVALID_LPOS; + return NULL; + } + + /* + * 1. Guarantee any descriptor states that have transitioned + * to reusable are stored before modifying the newly + * allocated data area. A full memory barrier is needed + * since other CPUs may have made the descriptor states + * reusable. See data_push_tail:A about why the reusable + * states are visible. This pairs with desc_read:D. + * + * 2. Guarantee any updated tail lpos is stored before + * modifying the newly allocated data area. Another CPU may + * be in data_make_reusable() and is reading a block ID + * from this area. data_make_reusable() can handle reading + * a garbage block ID value, but then it must be able to + * load a new tail lpos. A full memory barrier is needed + * since other CPUs may have updated the tail lpos. This + * pairs with data_push_tail:B. + */ + } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, + next_lpos)); /* LMM(data_alloc:A) */ + + blk = to_block(data_ring, begin_lpos); + blk->id = id; /* LMM(data_alloc:B) */ + + if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + } + + blk_lpos->begin = begin_lpos; + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* Return the number of bytes used by a data block. */ +static unsigned int space_used(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos) +{ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + /* Data block does not wrap. */ + return (DATA_INDEX(data_ring, blk_lpos->next) - + DATA_INDEX(data_ring, blk_lpos->begin)); + } + + /* + * For wrapping data blocks, the trailing (wasted) space is + * also counted. + */ + return (DATA_INDEX(data_ring, blk_lpos->next) + + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); +} + +/** + * prb_reserve() - Reserve space in the ringbuffer. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to reserve data in. + * @r: The record structure to allocate buffers for. + * + * This is the public function available to writers to reserve data. + * + * The writer specifies the text and dict sizes to reserve by setting the + * @text_buf_size and @dict_buf_size fields of @r, respectively. Dictionaries + * are optional, so @dict_buf_size is allowed to be 0. To ensure proper + * initialization of @r, prb_rec_init_wr() should be used. + * + * Context: Any context. Disables local interrupts on success. + * Return: true if at least text data could be allocated, otherwise false. + * + * On success, the fields @info, @text_buf, @dict_buf of @r will be set by + * this function and should be filled in by the writer before committing. Also + * on success, prb_record_text_space() can be used on @e to query the actual + * space used for the text data block. + * + * If the function fails to reserve dictionary space (but all else succeeded), + * it will still report success. In that case @dict_buf is set to NULL and + * @dict_buf_size is set to 0. Writers must check this before writing to + * dictionary space. + * + * @info->text_len and @info->dict_len will already be set to @text_buf_size + * and @dict_buf_size, respectively. If dictionary space reservation fails, + * @info->dict_len is set to 0. + */ +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_desc *d; + unsigned long id; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + /* Records are allowed to not have dictionaries. */ + if (r->dict_buf_size) { + if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) + goto fail; + } + + /* + * Descriptors in the reserved state act as blockers to all further + * reservations once the desc_ring has fully wrapped. Disable + * interrupts during the reserve/commit window in order to minimize + * the likelihood of this happening. + */ + local_irq_save(e->irqflags); + + if (!desc_reserve(rb, &id)) { + /* Descriptor reservation failures are tracked. */ + atomic_long_inc(&rb->fail); + local_irq_restore(e->irqflags); + goto fail; + } + + d = to_desc(desc_ring, id); + + /* + * Set the @e fields here so that prb_commit() can be used if + * text data allocation fails. + */ + e->rb = rb; + e->id = id; + + /* + * Initialize the sequence number if it has "never been set". + * Otherwise just increment it by a full wrap. + * + * @seq is considered "never been set" if it has a value of 0, + * _except_ for @descs[0], which was specially setup by the ringbuffer + * initializer and therefore is always considered as set. + * + * See the "Bootstrap" comment block in printk_ringbuffer.h for + * details about how the initializer bootstraps the descriptors. + */ + if (d->info.seq == 0 && DESC_INDEX(desc_ring, id) != 0) + d->info.seq = DESC_INDEX(desc_ring, id); + else + d->info.seq += DESCS_COUNT(desc_ring); + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + /* If text data allocation fails, a data-less record is committed. */ + if (r->text_buf_size && !r->text_buf) { + d->info.text_len = 0; + d->info.dict_len = 0; + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ + goto fail; + } + + r->dict_buf = data_alloc(rb, &rb->dict_data_ring, r->dict_buf_size, + &d->dict_blk_lpos, id); + /* + * If dict data allocation fails, the caller can still commit + * text. But dictionary information will not be available. + */ + if (r->dict_buf_size && !r->dict_buf) + r->dict_buf_size = 0; + + r->info = &d->info; + + /* Set default values for the sizes. */ + d->info.text_len = r->text_buf_size; + d->info.dict_len = r->dict_buf_size; + + /* Record full text space used by record. */ + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + /* Make it clear to the caller that the reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + struct prb_desc *d = to_desc(desc_ring, e->id); + unsigned long prev_state_val = e->id | 0; + + /* Now the writer has finished all writing: LMM(prb_commit:A) */ + + /* + * Set the descriptor as committed. See "ABA Issues" about why + * cmpxchg() instead of set() is used. + * + * Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient for + * this. This pairs with desc_read:B. + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + e->id | DESC_COMMITTED_MASK)) { /* LMM(prb_commit:B) */ + WARN_ON_ONCE(1); + } + + /* Restore interrupts, the reserve/commit window is finished. */ + local_irq_restore(e->irqflags); +} + +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (blk_lpos->begin == INVALID_LPOS && + blk_lpos->next == INVALID_LPOS) { + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. */ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + +/* + * Count the number of lines in provided text. All text has at least 1 line + * (even if @text_size is 0). Each '\n' processed is counted as an additional + * line. + */ +static unsigned int count_lines(char *text, unsigned int text_size) +{ + unsigned int next_size = text_size; + unsigned int line_count = 1; + char *next = text; + + while (next_size) { + next = memchr(next, '\n', next_size); + if (!next) + break; + line_count++; + next++; + next_size = text_size - (next - text); + } + + return line_count; +} + +/* + * Given @blk_lpos, copy an expected @len of data into the provided buffer. + * If @line_count is provided, count the number of lines in the data. + * + * This function (used by readers) performs strict validation on the data + * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static bool copy_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, + unsigned int buf_size, unsigned int *line_count) +{ + unsigned int data_size; + char *data; + + /* Caller might not want any data. */ + if ((!buf || !buf_size) && !line_count) + return true; + + data = get_data(data_ring, blk_lpos, &data_size); + if (!data) + return false; + + /* + * Actual cannot be less than expected. It can be more than expected + * because of the trailing alignment padding. + */ + if (WARN_ON_ONCE(data_size < (unsigned int)len)) { + pr_warn_once("wrong data size (%u, expecting %hu) for data: %.*s\n", + data_size, len, data_size, data); + return false; + } + + /* Caller interested in the line count? */ + if (line_count) + *line_count = count_lines(data, data_size); + + /* Caller interested in the data content? */ + if (!buf || !buf_size) + return true; + + data_size = min_t(u16, buf_size, len); + + if (!WARN_ON_ONCE(!data_size)) + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + return true; +} + +/* + * This is an extended version of desc_read(). It gets a copy of a specified + * descriptor. However, it also verifies that the record is committed and has + * the sequence number @seq. On success, 0 is returned. + * + * Error return values: + * -EINVAL: A committed record with sequence number @seq does not exist. + * -ENOENT: A committed record with sequence number @seq exists, but its data + * is not available. This is a valid record, so readers should + * continue with the next record. + */ +static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, + unsigned long id, u64 seq, + struct prb_desc *desc_out) +{ + struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; + enum desc_state d_state; + + d_state = desc_read(desc_ring, id, desc_out); + + /* + * An unexpected @id (desc_miss) or @seq mismatch means the record + * does not exist. A descriptor in the reserved state means the + * record does not yet exist for the reader. + */ + if (d_state == desc_miss || + d_state == desc_reserved || + desc_out->info.seq != seq) { + return -EINVAL; + } + + /* + * A descriptor in the reusable state may no longer have its data + * available; report it as a data-less record. Or the record may + * actually be a data-less record. + */ + if (d_state == desc_reusable || + (blk_lpos->begin == INVALID_LPOS && blk_lpos->next == INVALID_LPOS)) { + return -ENOENT; + } + + return 0; +} + +/* + * Copy the ringbuffer data from the record with @seq to the provided + * @r buffer. On success, 0 is returned. + * + * See desc_read_committed_seq() for error return values. + */ +static int prb_read(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r, unsigned int *line_count) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_desc *rdesc = to_desc(desc_ring, seq); + atomic_long_t *state_var = &rdesc->state_var; + struct prb_desc desc; + unsigned long id; + int err; + + /* Extract the ID, used to specify the descriptor to read. */ + id = DESC_ID(atomic_long_read(state_var)); + + /* Get a local copy of the correct descriptor (if available). */ + err = desc_read_committed_seq(desc_ring, id, seq, &desc); + + /* + * If @r is NULL, the caller is only interested in the availability + * of the record. + */ + if (err || !r) + return err; + + /* If requested, copy meta data. */ + if (r->info) + memcpy(r->info, &desc.info, sizeof(*(r->info))); + + /* Copy text data. If it fails, this is a data-less record. */ + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, desc.info.text_len, + r->text_buf, r->text_buf_size, line_count)) { + return -ENOENT; + } + + /* + * Copy dict data. Although this should not fail, dict data is not + * important. So if it fails, modify the copied meta data to report + * that there is no dict data, thus silently dropping the dict data. + */ + if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, desc.info.dict_len, + r->dict_buf, r->dict_buf_size, NULL)) { + if (r->info) + r->info->dict_len = 0; + } + + /* Ensure the record is still committed and has the same @seq. */ + return desc_read_committed_seq(desc_ring, id, seq, &desc); +} + +/* Get the sequence number of the tail descriptor. */ +static u64 prb_first_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + + for (;;) { + id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ + + d_state = desc_read(desc_ring, id, &desc); /* LMM(prb_first_seq:B) */ + + /* + * This loop will not be infinite because the tail is + * _always_ in the committed or reusable state. + */ + if (d_state == desc_committed || d_state == desc_reusable) + break; + + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail in the case + * that the descriptor has been recycled. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_first_seq:B reads from desc_reserve:F, then + * prb_first_seq:A reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB prb_first_seq:B to prb_first_seq:A + */ + smp_rmb(); /* LMM(prb_first_seq:C) */ + } + + return desc.info.seq; +} + +/* + * Non-blocking read of a record. Updates @seq to the last committed record + * (which may have no data). + * + * See the description of prb_read_valid() and prb_read_valid_info() + * for details. + */ +static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, + struct printk_record *r, unsigned int *line_count) +{ + u64 tail_seq; + int err; + + while ((err = prb_read(rb, *seq, r, line_count))) { + tail_seq = prb_first_seq(rb); + + if (*seq < tail_seq) { + /* + * Behind the tail. Catch up and try again. This + * can happen for -ENOENT and -EINVAL cases. + */ + *seq = tail_seq; + + } else if (err == -ENOENT) { + /* Record exists, but no data available. Skip. */ + (*seq)++; + + } else { + /* Non-existent/non-committed record. Must stop. */ + return false; + } + } + + return true; +} + +/** + * prb_read_valid() - Non-blocking read of a requested record or (if gone) + * the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @r: A record data buffer to store the read record to. + * + * This is the public function available to readers to read a record. + * + * The reader provides the @info, @text_buf, @dict_buf buffers of @r to be + * filled in. Any of the buffer pointers can be set to NULL if the reader + * is not interested in that data. To ensure proper initialization of @r, + * prb_rec_init_rd() should be used. + * + * Context: Any context. + * Return: true if a record was read, otherwise false. + * + * On success, the reader must check r->info.seq to see which record was + * actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r) +{ + return _prb_read_valid(rb, &seq, r, NULL); +} + +/** + * prb_read_valid_info() - Non-blocking read of meta data for a requested + * record or (if gone) the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @info: A buffer to store the read record meta data to. + * @line_count: A buffer to store the number of lines in the record text. + * + * This is the public function available to readers to read only the + * meta data of a record. + * + * The reader provides the @info, @line_count buffers to be filled in. + * Either of the buffer pointers can be set to NULL if the reader is not + * interested in that data. + * + * Context: Any context. + * Return: true if a record's meta data was read, otherwise false. + * + * On success, the reader must check info->seq to see which record meta data + * was actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count) +{ + struct printk_record r; + + prb_rec_init_rd(&r, info, NULL, 0, NULL, 0); + + return _prb_read_valid(rb, &seq, &r, line_count); +} + +/** + * prb_first_valid_seq() - Get the sequence number of the oldest available + * record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the + * first/oldest valid sequence number is. + * + * This provides readers a starting point to begin iterating the ringbuffer. + * + * Context: Any context. + * Return: The sequence number of the first/oldest record or, if the + * ringbuffer is empty, 0 is returned. + */ +u64 prb_first_valid_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + if (!_prb_read_valid(rb, &seq, NULL, NULL)) + return 0; + + return seq; +} + +/** + * prb_next_seq() - Get the sequence number after the last available record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the next + * newest sequence number available to readers will be. + * + * This provides readers a sequence number to jump to if all currently + * available records should be skipped. + * + * Context: Any context. + * Return: The sequence number of the next newest (not yet available) record + * for readers. + */ +u64 prb_next_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + /* Search forward from the oldest descriptor. */ + while (_prb_read_valid(rb, &seq, NULL, NULL)) + seq++; + + return seq; +} + +/** + * prb_init() - Initialize a ringbuffer to use provided external buffers. + * + * @rb: The ringbuffer to initialize. + * @text_buf: The data buffer for text data. + * @textbits: The size of @text_buf as a power-of-2 value. + * @dict_buf: The data buffer for dictionary data. + * @dictbits: The size of @dict_buf as a power-of-2 value. + * @descs: The descriptor buffer for ringbuffer records. + * @descbits: The count of @descs items as a power-of-2 value. + * + * This is the public function available to writers to setup a ringbuffer + * during runtime using provided buffers. + * + * This must match the initialization of DEFINE_PRINTKRB(). + * + * Context: Any context. + */ +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int textbits, + char *dict_buf, unsigned int dictbits, + struct prb_desc *descs, unsigned int descbits) +{ + memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + + rb->desc_ring.count_bits = descbits; + rb->desc_ring.descs = descs; + atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + + rb->text_data_ring.size_bits = textbits; + rb->text_data_ring.data = text_buf; + atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); + atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); + + rb->dict_data_ring.size_bits = dictbits; + rb->dict_data_ring.data = dict_buf; + atomic_long_set(&rb->dict_data_ring.head_lpos, BLK0_LPOS(dictbits)); + atomic_long_set(&rb->dict_data_ring.tail_lpos, BLK0_LPOS(dictbits)); + + atomic_long_set(&rb->fail, 0); + + descs[0].info.seq = -(u64)_DESCS_COUNT(descbits); + + descs[_DESCS_COUNT(descbits) - 1].info.seq = 0; + atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = INVALID_LPOS; +} + +/** + * prb_record_text_space() - Query the full actual used ringbuffer space for + * the text data of a reserved entry. + * + * @e: The successfully reserved entry to query. + * + * This is the public function available to writers to see how much actual + * space is used in the ringbuffer to store the text data of the specified + * entry. + * + * This function is only valid if @e has been successfully reserved using + * prb_reserve(). + * + * Context: Any context. + * Return: The size in bytes used by the text data of the associated record. + */ +unsigned int prb_record_text_space(struct prb_reserved_entry *e) +{ + return e->text_space; +} diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h new file mode 100644 index 000000000000..3e46a7423c13 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.h @@ -0,0 +1,399 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_PRINTK_RINGBUFFER_H +#define _KERNEL_PRINTK_RINGBUFFER_H + +#include + +/* + * Meta information about each stored message. + * + * All fields are set and used by the printk code except for + * @seq, @text_len, @dict_len, which are set and/or modified + * by the ringbuffer code. + */ +struct printk_info { + u64 seq; /* sequence number */ + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 text_len; /* length of text message */ + u16 dict_len; /* length of dictionary message */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ + u32 caller_id; /* thread id or processor id */ +}; + +/* + * A structure providing the buffers, used by writers and readers. + * + * Writers: + * Using prb_rec_init_wr(), a writer sets @text_buf_size and @dict_buf_size + * before calling prb_reserve(). On success, prb_reserve() sets @info, + * @text_buf, @dict_buf to buffers reserved for that writer. + * + * Readers: + * Using prb_rec_init_rd(), a reader sets all fields before calling + * prb_read_valid(). Note that the reader provides the @info, @text_buf, + * @dict_buf buffers. On success, the struct pointed to by @info will be + * filled and the char arrays pointed to by @text_buf and @dict_buf will + * be filled with text and dict data. + */ +struct printk_record { + struct printk_info *info; + char *text_buf; + char *dict_buf; + unsigned int text_buf_size; + unsigned int dict_buf_size; +}; + +/* Specifies the logical position and span of a data block. */ +struct prb_data_blk_lpos { + unsigned long begin; + unsigned long next; +}; + +/* + * A descriptor: the complete meta-data for a record. + * + * @state_var: A bitwise combination of descriptor ID and descriptor state. + */ +struct prb_desc { + struct printk_info info; + atomic_long_t state_var; + struct prb_data_blk_lpos text_blk_lpos; + struct prb_data_blk_lpos dict_blk_lpos; +}; + +/* A ringbuffer of "ID + data" elements. */ +struct prb_data_ring { + unsigned int size_bits; + char *data; + atomic_long_t head_lpos; + atomic_long_t tail_lpos; +}; + +/* A ringbuffer of "struct prb_desc" elements. */ +struct prb_desc_ring { + unsigned int count_bits; + struct prb_desc *descs; + atomic_long_t head_id; + atomic_long_t tail_id; +}; + +/* + * The high level structure representing the printk ringbuffer. + * + * @fail: Count of failed prb_reserve() calls where not even a data-less + * record was created. + */ +struct printk_ringbuffer { + struct prb_desc_ring desc_ring; + struct prb_data_ring text_data_ring; + struct prb_data_ring dict_data_ring; + atomic_long_t fail; +}; + +/* + * Used by writers as a reserve/commit handle. + * + * @rb: Ringbuffer where the entry is reserved. + * @irqflags: Saved irq flags to restore on entry commit. + * @id: ID of the reserved descriptor. + * @text_space: Total occupied buffer space in the text data ring, including + * ID, alignment padding, and wrapping data blocks. + * + * This structure is an opaque handle for writers. Its contents are only + * to be used by the ringbuffer implementation. + */ +struct prb_reserved_entry { + struct printk_ringbuffer *rb; + unsigned long irqflags; + unsigned long id; + unsigned int text_space; +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_COMMITTED_MASK (1UL << (DESC_SV_BITS - 1)) +#define DESC_REUSE_MASK (1UL << (DESC_SV_BITS - 2)) +#define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define INVALID_LPOS 1 + +#define INVALID_BLK_LPOS \ +{ \ + .begin = INVALID_LPOS, \ + .next = INVALID_LPOS, \ +} + +/* + * Descriptor Bootstrap + * + * The descriptor array is minimally initialized to allow immediate usage + * by readers and writers. The requirements that the descriptor array + * initialization must satisfy: + * + * Req1 + * The tail must point to an existing (committed or reusable) descriptor. + * This is required by the implementation of prb_first_seq(). + * + * Req2 + * Readers must see that the ringbuffer is initially empty. + * + * Req3 + * The first record reserved by a writer is assigned sequence number 0. + * + * To satisfy Req1, the tail initially points to a descriptor that is + * minimally initialized (having no data block, i.e. data-less with the + * data block's lpos @begin and @next values set to INVALID_LPOS). + * + * To satisfy Req2, the initial tail descriptor is initialized to the + * reusable state. Readers recognize reusable descriptors as existing + * records, but skip over them. + * + * To satisfy Req3, the last descriptor in the array is used as the initial + * head (and tail) descriptor. This allows the first record reserved by a + * writer (head + 1) to be the first descriptor in the array. (Only the first + * descriptor in the array could have a valid sequence number of 0.) + * + * The first time a descriptor is reserved, it is assigned a sequence number + * with the value of the array index. A "first time reserved" descriptor can + * be recognized because it has a sequence number of 0 but does not have an + * index of 0. (Only the first descriptor in the array could have a valid + * sequence number of 0.) After the first reservation, all future reservations + * (recycling) simply involve incrementing the sequence number by the array + * count. + * + * Hack #1 + * Only the first descriptor in the array is allowed to have the sequence + * number 0. In this case it is not possible to recognize if it is being + * reserved the first time (set to index value) or has been reserved + * previously (increment by the array count). This is handled by _always_ + * incrementing the sequence number by the array count when reserving the + * first descriptor in the array. In order to satisfy Req3, the sequence + * number of the first descriptor in the array is initialized to minus + * the array count. Then, upon the first reservation, it is incremented + * to 0, thus satisfying Req3. + * + * Hack #2 + * prb_first_seq() can be called at any time by readers to retrieve the + * sequence number of the tail descriptor. However, due to Req2 and Req3, + * initially there are no records to report the sequence number of + * (sequence numbers are u64 and there is nothing less than 0). To handle + * this, the sequence number of the initial tail descriptor is initialized + * to 0. Technically this is incorrect, because there is no record with + * sequence number 0 (yet) and the tail descriptor is not the first + * descriptor in the array. But it allows prb_read_valid() to correctly + * report the existence of a record for _any_ given sequence number at all + * times. Bootstrapping is complete when the tail is pushed the first + * time, thus finally pointing to the first descriptor reserved by a + * writer, which has the assigned sequence number 0. + */ + +/* + * Initiating Logical Value Overflows + * + * Both logical position (lpos) and ID values can be mapped to array indexes + * but may experience overflows during the lifetime of the system. To ensure + * that printk_ringbuffer can handle the overflows for these types, initial + * values are chosen that map to the correct initial array indexes, but will + * result in overflows soon. + * + * BLK0_LPOS + * The initial @head_lpos and @tail_lpos for data rings. It is at index + * 0 and the lpos value is such that it will overflow on the first wrap. + * + * DESC0_ID + * The initial @head_id and @tail_id for the desc ring. It is at the last + * index of the descriptor array (see Req3 above) and the ID value is such + * that it will overflow on the second wrap. + */ +#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) +#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) +#define DESC0_SV(ct_bits) (DESC_COMMITTED_MASK | DESC_REUSE_MASK | DESC0_ID(ct_bits)) + +/* + * Define a ringbuffer with an external text data buffer. The same as + * DEFINE_PRINTKRB() but requires specifying an external buffer for the + * text data. + * + * Note: The specified external buffer must be of the size: + * 2 ^ (descbits + avgtextbits) + */ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, text_buf) \ +static char _##name##_dict[1U << ((avgdictbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + .info = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + .info = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ + /* reusable */ \ + .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ + /* no associated data block */ \ + .text_blk_lpos = INVALID_BLK_LPOS, \ + .dict_blk_lpos = INVALID_BLK_LPOS, \ + }, \ +}; \ +static struct printk_ringbuffer name = { \ + .desc_ring = { \ + .count_bits = descbits, \ + .descs = &_##name##_descs[0], \ + .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + }, \ + .text_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = text_buf, \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .dict_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = &_##name##_dict[0], \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .fail = ATOMIC_LONG_INIT(0), \ +} + +/** + * DEFINE_PRINTKRB() - Define a ringbuffer. + * + * @name: The name of the ringbuffer variable. + * @descbits: The number of descriptors as a power-of-2 value. + * @avgtextbits: The average text data size per record as a power-of-2 value. + * @avgdictbits: The average dictionary data size per record as a + * power-of-2 value. + * + * This is a macro for defining a ringbuffer and all internal structures + * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a + * variant where the text data buffer can be specified externally. + */ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits) \ +static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +_DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, &_##name##_text[0]) + +/* Writer Interface */ + +/** + * prb_rec_init_wd() - Initialize a buffer for writing records. + * + * @r: The record to initialize. + * @text_buf_size: The needed text buffer size. + * @dict_buf_size: The needed dictionary buffer size. + * + * Initialize all the fields that a writer is interested in. If + * @dict_buf_size is 0, a dictionary buffer will not be reserved. + * @text_buf_size must be greater than 0. + * + * Note that although @dict_buf_size may be initialized to non-zero, + * its value must be rechecked after a successful call to prb_reserve() + * to verify a dictionary buffer was actually reserved. Dictionary buffer + * reservation is allowed to fail. + */ +static inline void prb_rec_init_wr(struct printk_record *r, + unsigned int text_buf_size, + unsigned int dict_buf_size) +{ + r->info = NULL; + r->text_buf = NULL; + r->dict_buf = NULL; + r->text_buf_size = text_buf_size; + r->dict_buf_size = dict_buf_size; +} + +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r); +void prb_commit(struct prb_reserved_entry *e); + +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int text_buf_size, + char *dict_buf, unsigned int dict_buf_size, + struct prb_desc *descs, unsigned int descs_count_bits); +unsigned int prb_record_text_space(struct prb_reserved_entry *e); + +/* Reader Interface */ + +/** + * prb_rec_init_rd() - Initialize a buffer for reading records. + * + * @r: The record to initialize. + * @info: A buffer to store record meta-data. + * @text_buf: A buffer to store text data. + * @text_buf_size: The size of @text_buf. + * @dict_buf: A buffer to store dictionary data. + * @dict_buf_size: The size of @dict_buf. + * + * Initialize all the fields that a reader is interested in. All arguments + * (except @r) are optional. Only record data for arguments that are + * non-NULL or non-zero will be read. + */ +static inline void prb_rec_init_rd(struct printk_record *r, + struct printk_info *info, + char *text_buf, unsigned int text_buf_size, + char *dict_buf, unsigned int dict_buf_size) +{ + r->info = info; + r->text_buf = text_buf; + r->dict_buf = dict_buf; + r->text_buf_size = text_buf_size; + r->dict_buf_size = dict_buf_size; +} + +/** + * prb_for_each_record() - Iterate over the records of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @r: A printk_record to store the record on each iteration. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_record(from, rb, s, r) \ +for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) + +/** + * prb_for_each_info() - Iterate over the meta data of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @i: A printk_info to store the record meta data on each iteration. + * @lc: An unsigned int to store the text line count of each record. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_info(from, rb, s, i, lc) \ +for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) + +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r); +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count); + +u64 prb_first_valid_seq(struct printk_ringbuffer *rb); +u64 prb_next_seq(struct printk_ringbuffer *rb); + +#endif /* _KERNEL_PRINTK_RINGBUFFER_H */ -- cgit v1.2.3 From 8749efc0c0c325bf0c948c0b11d77bd3e497ead5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 9 Jul 2020 15:29:43 +0206 Subject: Revert "printk: lock/unlock console only for new logbuf entries" This reverts commit 3ac37a93fa9217e576bebfd4ba3e80edaaeb2289. This optimization will not apply once the transition to a lockless printk is complete. Rather than porting this optimization through the transition only to remove it anyway, just revert it now to simplify the transition. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200709132344.760-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b71eaf5f5a86..1b41e1b98221 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1981,9 +1981,8 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; - bool in_sched = false, pending_output; + bool in_sched = false; unsigned long flags; - u64 curr_log_seq; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -1999,13 +1998,11 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - curr_log_seq = log_next_seq; printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); - pending_output = (curr_log_seq != log_next_seq); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched && pending_output) { + if (!in_sched) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -2022,8 +2019,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (pending_output) - wake_up_klogd(); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.2.3 From 896fbe20b4e2333fb55cc9b9b783ebcc49eee7c7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 9 Jul 2020 15:29:44 +0206 Subject: printk: use the lockless ringbuffer Replace the existing ringbuffer usage and implementation with lockless ringbuffer usage. Even though the new ringbuffer does not require locking, all existing locking is left in place. Therefore, this change is purely replacing the underlining ringbuffer. Changes that exist due to the ringbuffer replacement: - The VMCOREINFO has been updated for the new structures. - Dictionary data is now stored in a separate data buffer from the human-readable messages. The dictionary data buffer is set to the same size as the message buffer. Therefore, the total required memory for both dictionary and message data is 2 * (2 ^ CONFIG_LOG_BUF_SHIFT) for the initial static buffers and 2 * log_buf_len (the kernel parameter) for the dynamic buffers. - Record meta-data is now stored in a separate array of descriptors. This is an additional 72 * (2 ^ (CONFIG_LOG_BUF_SHIFT - 5)) bytes for the static array and 72 * (log_buf_len >> 5) bytes for the dynamic array. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200709132344.760-5-john.ogness@linutronix.de --- kernel/printk/printk.c | 940 ++++++++++++++++++++++++++----------------------- 1 file changed, 493 insertions(+), 447 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1b41e1b98221..fec71229169e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #define CREATE_TRACE_POINTS #include +#include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" @@ -294,30 +295,24 @@ enum con_msg_format_flags { static int console_msg_format = MSG_FORMAT_DEFAULT; /* - * The printk log buffer consists of a chain of concatenated variable - * length records. Every record starts with a record header, containing - * the overall length of the record. + * The printk log buffer consists of a sequenced collection of records, each + * containing variable length message and dictionary text. Every record + * also contains its own meta-data (@info). * - * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these entries are maintained when messages are - * stored. + * Every record meta-data carries the timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual kernel + * messages use LOG_KERN; userspace-injected messages always carry a matching + * syslog facility, by default LOG_USER. The origin of every message can be + * reliably determined that way. * - * If the heads indicate available messages, the length in the header - * tells the start next message. A length == 0 for the next message - * indicates a wrap-around to the beginning of the buffer. + * The human readable log message of a record is available in @text, the + * length of the message text in @text_len. The stored message is not + * terminated. * - * Every record carries the monotonic timestamp in microseconds, as well as - * the standard userspace syslog level and syslog facility. The usual - * kernel messages use LOG_KERN; userspace-injected messages always carry - * a matching syslog facility, by default LOG_USER. The origin of every - * message can be reliably determined that way. - * - * The human readable log message directly follows the message header. The - * length of the message text is stored in the header, the stored message - * is not terminated. - * - * Optionally, a message can carry a dictionary of properties (key/value pairs), - * to provide userspace with a machine-readable message context. + * Optionally, a record can carry a dictionary of properties (key/value + * pairs), to provide userspace with a machine-readable message context. The + * length of the dictionary is available in @dict_len. The dictionary is not + * terminated. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -331,21 +326,19 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * follows directly after a '=' character. Every property is terminated by * a '\0' character. The last property is not terminated. * - * Example of a message structure: - * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec - * 0008 34 00 record is 52 bytes long - * 000a 0b 00 text is 11 bytes long - * 000c 1f 00 dictionary is 23 bytes long - * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) - * 0010 69 74 27 73 20 61 20 6c "it's a l" - * 69 6e 65 "ine" - * 001b 44 45 56 49 43 "DEVIC" - * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" - * 52 49 56 45 52 3d 62 75 "RIVER=bu" - * 67 "g" - * 0032 00 00 00 padding to next message header - * - * The 'struct printk_log' buffer header must never be directly exported to + * Example of record values: + * record.text_buf = "it's a line" (unterminated) + * record.dict_buf = "DEVICE=b8:2\0DRIVER=bug" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.dict_len = 22 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * + * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -365,23 +358,6 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct printk_log { - u64 ts_nsec; /* timestamp in nanoseconds */ - u16 len; /* length of entire record */ - u16 text_len; /* length of text buffer */ - u16 dict_len; /* length of dictionary buffer */ - u8 facility; /* syslog facility */ - u8 flags:5; /* internal record flags */ - u8 level:3; /* syslog level */ -#ifdef CONFIG_PRINTK_CALLER - u32 caller_id; /* thread id or processor id */ -#endif -} -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -__packed __aligned(4) -#endif -; - /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken * within the scheduler's rq lock. It must be released before calling @@ -421,26 +397,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; -static u32 syslog_idx; static size_t syslog_partial; static bool syslog_time; -/* index and sequence number of the first record stored in the buffer */ -static u64 log_first_seq; -static u32 log_first_idx; - -/* index and sequence number of the next record to store in the buffer */ -static u64 log_next_seq; -static u32 log_next_idx; - /* the next printk record to write to the console */ static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; +static unsigned long console_dropped; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; -static u32 clear_idx; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 @@ -453,13 +419,31 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#define LOG_ALIGN __alignof__(struct printk_log) +#define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* + * Define the average message size. This only affects the number of + * descriptors that will be available. Underestimating is better than + * overestimating (too many available descriptors is better than not enough). + * The dictionary buffer will be the same size as the text buffer. + */ +#define PRB_AVGBITS 5 /* 32 character average length */ + +#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS +#error CONFIG_LOG_BUF_SHIFT value too small. +#endif +_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, + PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); + +static struct printk_ringbuffer printk_rb_dynamic; + +static struct printk_ringbuffer *prb = &printk_rb_static; + /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when @@ -484,108 +468,6 @@ u32 log_buf_len_get(void) return log_buf_len; } -/* human readable text of the record */ -static char *log_text(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log); -} - -/* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log) + msg->text_len; -} - -/* get record by index; idx must point to valid msg */ -static struct printk_log *log_from_idx(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer. - */ - if (!msg->len) - return (struct printk_log *)log_buf; - return msg; -} - -/* get next record; idx must point to valid msg */ -static u32 log_next(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* length == 0 indicates the end of the buffer; wrap */ - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer as *this* one, and - * return the one after that. - */ - if (!msg->len) { - msg = (struct printk_log *)log_buf; - return msg->len; - } - return idx + msg->len; -} - -/* - * Check whether there is enough free space for the given message. - * - * The same values of first_idx and next_idx mean that the buffer - * is either empty or full. - * - * If the buffer is empty, we must respect the position of the indexes. - * They cannot be reset to the beginning of the buffer. - */ -static int logbuf_has_space(u32 msg_size, bool empty) -{ - u32 free; - - if (log_next_idx > log_first_idx || empty) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - /* - * We need space also for an empty header that signalizes wrapping - * of the buffer. - */ - return free >= msg_size + sizeof(struct printk_log); -} - -static int log_make_free_space(u32 msg_size) -{ - while (log_first_seq < log_next_seq && - !logbuf_has_space(msg_size, false)) { - /* drop old messages until we have enough contiguous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } - - if (clear_seq < log_first_seq) { - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - - /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) - return 0; - - return -ENOMEM; -} - -/* compute the message size including the padding bytes */ -static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) -{ - u32 size; - - size = sizeof(struct printk_log) + text_len + dict_len; - *pad_len = (-size) & (LOG_ALIGN - 1); - size += *pad_len; - - return size; -} - /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available @@ -594,22 +476,23 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = ""; -static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, - u16 *dict_len, u32 *pad_len) +static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) *text_len = max_text_len; - /* enable the warning message */ + + /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); - /* disable the "dict" completely */ - *dict_len = 0; - /* compute the size again, count also the warning message */ - return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); + if (*text_len >= *trunc_msg_len) + *text_len -= *trunc_msg_len; + else + *trunc_msg_len = 0; } /* insert record into the buffer, discard old ones, update heads */ @@ -618,60 +501,40 @@ static int log_store(u32 caller_id, int facility, int level, const char *dict, u16 dict_len, const char *text, u16 text_len) { - struct printk_log *msg; - u32 size, pad_len; + struct prb_reserved_entry e; + struct printk_record r; u16 trunc_msg_len = 0; - /* number of '\0' padding bytes to next message */ - size = msg_used_size(text_len, dict_len, &pad_len); + prb_rec_init_wr(&r, text_len, dict_len); - if (log_make_free_space(size)) { + if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ - size = truncate_msg(&text_len, &trunc_msg_len, - &dict_len, &pad_len); + truncate_msg(&text_len, &trunc_msg_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len, dict_len); /* survive when the log buffer is too small for trunc_msg */ - if (log_make_free_space(size)) + if (!prb_reserve(&e, prb, &r)) return 0; } - if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { - /* - * This message + an additional empty header does not fit - * at the end of the buffer. Add an empty header with len == 0 - * to signify a wrap around. - */ - memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); - log_next_idx = 0; - } - /* fill message */ - msg = (struct printk_log *)(log_buf + log_next_idx); - memcpy(log_text(msg), text, text_len); - msg->text_len = text_len; - if (trunc_msg_len) { - memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); - msg->text_len += trunc_msg_len; - } - memcpy(log_dict(msg), dict, dict_len); - msg->dict_len = dict_len; - msg->facility = facility; - msg->level = level & 7; - msg->flags = flags & 0x1f; + memcpy(&r.text_buf[0], text, text_len); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + if (r.dict_buf) + memcpy(&r.dict_buf[0], dict, dict_len); + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = flags & 0x1f; if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; + r.info->ts_nsec = ts_nsec; else - msg->ts_nsec = local_clock(); -#ifdef CONFIG_PRINTK_CALLER - msg->caller_id = caller_id; -#endif - memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = size; + r.info->ts_nsec = local_clock(); + r.info->caller_id = caller_id; /* insert message */ - log_next_idx += msg->len; - log_next_seq++; + prb_commit(&e); - return msg->text_len; + return (text_len + trunc_msg_len); } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); @@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c) *(*pp)++ = c; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq) +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) { - u64 ts_usec = msg->ts_nsec; + u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER - u32 id = msg->caller_id; + u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); @@ -740,8 +603,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-', caller); + (info->facility << 3) | info->level, info->seq, + ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -795,10 +658,14 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; - u32 idx; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; + + struct printk_info info; + char text_buf[CONSOLE_EXT_LOG_MAX]; + char dict_buf[CONSOLE_EXT_LOG_MAX]; + struct printk_record record; }; static __printf(3, 4) __cold @@ -881,7 +748,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct printk_log *msg; + struct printk_record *r = &user->record; size_t len; ssize_t ret; @@ -893,7 +760,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, return ret; logbuf_lock_irq(); - while (user->seq == log_next_seq) { + if (!prb_read_valid(prb, user->seq, r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; logbuf_unlock_irq(); @@ -902,30 +769,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, - user->seq != log_next_seq); + prb_read_valid(prb, user->seq, r)); if (ret) goto out; logbuf_lock_irq(); } - if (user->seq < log_first_seq) { + if (user->seq < prb_first_valid_seq(prb)) { /* our last seen message is gone, return error and reset */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); ret = -EPIPE; logbuf_unlock_irq(); goto out; } - msg = log_from_idx(user->idx); - len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq); + len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r->dict_buf[0], r->info->dict_len, + &r->text_buf[0], r->info->text_len); - user->idx = log_next(user->idx); - user->seq++; + user->seq = r->info->seq + 1; logbuf_unlock_irq(); if (len > count) { @@ -957,8 +820,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: /* the first record */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); break; case SEEK_DATA: /* @@ -966,13 +828,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ - user->idx = clear_idx; user->seq = clear_seq; break; case SEEK_END: /* after the last record */ - user->idx = log_next_idx; - user->seq = log_next_seq; + user->seq = prb_next_seq(prb); break; default: ret = -EINVAL; @@ -992,9 +852,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); logbuf_lock_irq(); - if (user->seq < log_next_seq) { + if (prb_read_valid(prb, user->seq, NULL)) { /* return error when data has vanished underneath us */ - if (user->seq < log_first_seq) + if (user->seq < prb_first_valid_seq(prb)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -1029,9 +889,12 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf), + &user->dict_buf[0], sizeof(user->dict_buf)); + logbuf_lock_irq(); - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); logbuf_unlock_irq(); file->private_data = user; @@ -1072,23 +935,52 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(log_first_idx); - VMCOREINFO_SYMBOL(clear_idx); - VMCOREINFO_SYMBOL(log_next_idx); + VMCOREINFO_SYMBOL(prb); + VMCOREINFO_SYMBOL(printk_rb_static); + VMCOREINFO_SYMBOL(clear_seq); + /* - * Export struct printk_log size and field offsets. User space tools can + * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ - VMCOREINFO_STRUCT_SIZE(printk_log); - VMCOREINFO_OFFSET(printk_log, ts_nsec); - VMCOREINFO_OFFSET(printk_log, len); - VMCOREINFO_OFFSET(printk_log, text_len); - VMCOREINFO_OFFSET(printk_log, dict_len); -#ifdef CONFIG_PRINTK_CALLER - VMCOREINFO_OFFSET(printk_log, caller_id); -#endif + + VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); + VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, dict_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, fail); + + VMCOREINFO_STRUCT_SIZE(prb_desc_ring); + VMCOREINFO_OFFSET(prb_desc_ring, count_bits); + VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, head_id); + VMCOREINFO_OFFSET(prb_desc_ring, tail_id); + + VMCOREINFO_STRUCT_SIZE(prb_desc); + VMCOREINFO_OFFSET(prb_desc, info); + VMCOREINFO_OFFSET(prb_desc, state_var); + VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); + VMCOREINFO_OFFSET(prb_desc, dict_blk_lpos); + + VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); + VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); + VMCOREINFO_OFFSET(prb_data_blk_lpos, next); + + VMCOREINFO_STRUCT_SIZE(printk_info); + VMCOREINFO_OFFSET(printk_info, seq); + VMCOREINFO_OFFSET(printk_info, ts_nsec); + VMCOREINFO_OFFSET(printk_info, text_len); + VMCOREINFO_OFFSET(printk_info, dict_len); + VMCOREINFO_OFFSET(printk_info, caller_id); + + VMCOREINFO_STRUCT_SIZE(prb_data_ring); + VMCOREINFO_OFFSET(prb_data_ring, size_bits); + VMCOREINFO_OFFSET(prb_data_ring, data); + VMCOREINFO_OFFSET(prb_data_ring, head_lpos); + VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); } #endif @@ -1166,11 +1058,48 @@ static void __init set_percpu_data_ready(void) __printk_percpu_data_ready = true; } +static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_reserved_entry e; + struct printk_record dest_r; + + prb_rec_init_wr(&dest_r, r->info->text_len, r->info->dict_len); + + if (!prb_reserve(&e, rb, &dest_r)) + return 0; + + memcpy(&dest_r.text_buf[0], &r->text_buf[0], dest_r.text_buf_size); + if (dest_r.dict_buf) { + memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], + dest_r.dict_buf_size); + } + dest_r.info->facility = r->info->facility; + dest_r.info->level = r->info->level; + dest_r.info->flags = r->info->flags; + dest_r.info->ts_nsec = r->info->ts_nsec; + dest_r.info->caller_id = r->info->caller_id; + + prb_commit(&e); + + return prb_record_text_space(&e); +} + +static char setup_text_buf[CONSOLE_EXT_LOG_MAX] __initdata; +static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] __initdata; + void __init setup_log_buf(int early) { + unsigned int new_descs_count; + struct prb_desc *new_descs; + struct printk_info info; + struct printk_record r; + size_t new_descs_size; unsigned long flags; + char *new_dict_buf; char *new_log_buf; unsigned int free; + u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very @@ -1189,21 +1118,70 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; + new_descs_count = new_log_buf_len >> PRB_AVGBITS; + if (new_descs_count == 0) { + pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); + return; + } + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %lu bytes not available\n", - new_log_buf_len); + pr_err("log_buf_len: %lu text bytes not available\n", + new_log_buf_len); return; } + new_dict_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); + if (unlikely(!new_dict_buf)) { + pr_err("log_buf_len: %lu dict bytes not available\n", + new_log_buf_len); + memblock_free(__pa(new_log_buf), new_log_buf_len); + return; + } + + new_descs_size = new_descs_count * sizeof(struct prb_desc); + new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); + if (unlikely(!new_descs)) { + pr_err("log_buf_len: %zu desc bytes not available\n", + new_descs_size); + memblock_free(__pa(new_dict_buf), new_log_buf_len); + memblock_free(__pa(new_log_buf), new_log_buf_len); + return; + } + + prb_rec_init_rd(&r, &info, + &setup_text_buf[0], sizeof(setup_text_buf), + &setup_dict_buf[0], sizeof(setup_dict_buf)); + + prb_init(&printk_rb_dynamic, + new_log_buf, ilog2(new_log_buf_len), + new_dict_buf, ilog2(new_log_buf_len), + new_descs, ilog2(new_descs_count)); + logbuf_lock_irqsave(flags); + log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_next_idx; - memcpy(log_buf, __log_buf, __LOG_BUF_LEN); + + free = __LOG_BUF_LEN; + prb_for_each_record(0, &printk_rb_static, seq, &r) + free -= add_to_rb(&printk_rb_dynamic, &r); + + /* + * This is early enough that everything is still running on the + * boot CPU and interrupts are disabled. So no new messages will + * appear during the transition to the dynamic buffer. + */ + prb = &printk_rb_dynamic; + logbuf_unlock_irqrestore(flags); + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); + } + pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); @@ -1313,18 +1291,18 @@ static size_t print_caller(u32 id, char *buf) #define print_caller(id, buf) 0 #endif -static size_t print_prefix(const struct printk_log *msg, bool syslog, - bool time, char *buf) +static size_t info_print_prefix(const struct printk_info *info, bool syslog, + bool time, char *buf) { size_t len = 0; if (syslog) - len = print_syslog((msg->facility << 3) | msg->level, buf); + len = print_syslog((info->facility << 3) | info->level, buf); if (time) - len += print_time(msg->ts_nsec, buf + len); + len += print_time(info->ts_nsec, buf + len); - len += print_caller(msg->caller_id, buf + len); + len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; @@ -1334,72 +1312,143 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) +/* + * Prepare the record for printing. The text is shifted within the given + * buffer to avoid a need for another one. The following operations are + * done: + * + * - Add prefix for each line. + * - Add the trailing newline that has been removed in vprintk_store(). + * - Drop truncated lines that do not longer fit into the buffer. + * + * Return: The length of the updated/prepared text, including the added + * prefixes and the newline. The dropped line(s) are not counted. + */ +static size_t record_print_text(struct printk_record *r, bool syslog, + bool time) { - const char *text = log_text(msg); - size_t text_size = msg->text_len; - size_t len = 0; + size_t text_len = r->info->text_len; + size_t buf_size = r->text_buf_size; + char *text = r->text_buf; char prefix[PREFIX_MAX]; - const size_t prefix_len = print_prefix(msg, syslog, time, prefix); + bool truncated = false; + size_t prefix_len; + size_t line_len; + size_t len = 0; + char *next; - do { - const char *next = memchr(text, '\n', text_size); - size_t text_len; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); + /* + * @text_len: bytes of unprocessed text + * @line_len: bytes of current line _without_ newline + * @text: pointer to beginning of current line + * @len: number of bytes prepared in r->text_buf + */ + for (;;) { + next = memchr(text, '\n', text_len); if (next) { - text_len = next - text; - next++; - text_size -= next - text; + line_len = next - text; } else { - text_len = text_size; + /* Drop truncated line(s). */ + if (truncated) + break; + line_len = text_len; } - if (buf) { - if (prefix_len + text_len + 1 >= size - len) + /* + * Truncate the text if there is not enough space to add the + * prefix and a trailing newline. + */ + if (len + prefix_len + text_len + 1 > buf_size) { + /* Drop even the current line if no space. */ + if (len + prefix_len + line_len + 1 > buf_size) break; - memcpy(buf + len, prefix, prefix_len); - len += prefix_len; - memcpy(buf + len, text, text_len); - len += text_len; - buf[len++] = '\n'; - } else { - /* SYSLOG_ACTION_* buffer size only calculation */ - len += prefix_len + text_len + 1; + text_len = buf_size - len - prefix_len - 1; + truncated = true; } - text = next; - } while (text); + memmove(text + prefix_len, text, text_len); + memcpy(text, prefix, prefix_len); + + len += prefix_len + line_len + 1; + + if (text_len == line_len) { + /* + * Add the trailing newline removed in + * vprintk_store(). + */ + text[prefix_len + line_len] = '\n'; + break; + } + + /* + * Advance beyond the added prefix and the related line with + * its newline. + */ + text += prefix_len + line_len + 1; + + /* + * The remaining text has only decreased by the line with its + * newline. + * + * Note that @text_len can become zero. It happens when @text + * ended with a newline (either due to truncation or the + * original string ending with "\n\n"). The loop is correctly + * repeated and (if not truncated) an empty line with a prefix + * will be prepared. + */ + text_len -= line_len + 1; + } return len; } +static size_t get_record_print_text_size(struct printk_info *info, + unsigned int line_count, + bool syslog, bool time) +{ + char prefix[PREFIX_MAX]; + size_t prefix_len; + + prefix_len = info_print_prefix(info, syslog, time, prefix); + + /* + * Each line will be preceded with a prefix. The intermediate + * newlines are already within the text, but a final trailing + * newline will be added. + */ + return ((prefix_len * line_count) + info->text_len + 1); +} + static int syslog_print(char __user *buf, int size) { + struct printk_info info; + struct printk_record r; char *text; - struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); + while (size > 0) { size_t n; size_t skip; logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_partial = 0; - } - if (syslog_seq == log_next_seq) { + if (!prb_read_valid(prb, syslog_seq, &r)) { logbuf_unlock_irq(); break; } + if (r.info->seq != syslog_seq) { + /* message is gone, move to next valid one */ + syslog_seq = r.info->seq; + syslog_partial = 0; + } /* * To keep reading/counting partial line consistent, @@ -1409,13 +1458,10 @@ static int syslog_print(char __user *buf, int size) syslog_time = printk_time; skip = syslog_partial; - msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, syslog_time, text, - LOG_LINE_MAX + PREFIX_MAX); + n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ - syslog_idx = log_next(syslog_idx); - syslog_seq++; + syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1446,11 +1492,12 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; char *text; int len = 0; - u64 next_seq; u64 seq; - u32 idx; bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1463,38 +1510,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear) * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; - } + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) + len += get_record_print_text_size(&info, line_count, true, time); /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { + if (len <= size) + break; + len -= get_record_print_text_size(&info, line_count, true, time); } - /* last message fitting into this dump */ - next_seq = log_next_seq; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen = msg_print_text(msg, true, time, text, - LOG_LINE_MAX + PREFIX_MAX); + prb_for_each_record(seq, prb, seq, &r) { + int textlen; - idx = log_next(idx); - seq++; + textlen = record_print_text(&r, true, time); + + if (len + textlen > size) { + seq--; + break; + } logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) @@ -1503,17 +1540,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len += textlen; logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (len < 0) + break; } - if (clear) { - clear_seq = log_next_seq; - clear_idx = log_next_idx; - } + if (clear) + clear_seq = seq; logbuf_unlock_irq(); kfree(text); @@ -1523,8 +1555,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; + clear_seq = prb_next_seq(prb); logbuf_unlock_irq(); } @@ -1551,7 +1582,7 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, - syslog_seq != log_next_seq); + prb_read_valid(prb, syslog_seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1600,10 +1631,9 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { + if (syslog_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; + syslog_seq = prb_first_valid_seq(prb); syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1612,20 +1642,18 @@ int do_syslog(int type, char __user *buf, int len, int source) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_seq - syslog_seq; + error = prb_next_seq(prb) - syslog_seq; } else { - u64 seq = syslog_seq; - u32 idx = syslog_idx; bool time = syslog_partial ? syslog_time : printk_time; - - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - error += msg_print_text(msg, true, time, NULL, - 0); + struct printk_info info; + unsigned int line_count; + u64 seq; + + prb_for_each_info(syslog_seq, prb, seq, &info, + &line_count) { + error += get_record_print_text_size(&info, line_count, + true, time); time = printk_time; - idx = log_next(idx); - seq++; } error -= syslog_partial; } @@ -1796,10 +1824,22 @@ static int console_trylock_spinning(void) static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { + static char dropped_text[64]; + size_t dropped_len = 0; struct console *con; trace_console_rcuidle(text, len); + if (!console_drivers) + return; + + if (console_dropped) { + dropped_len = snprintf(dropped_text, sizeof(dropped_text), + "** %lu printk messages dropped **\n", + console_dropped); + console_dropped = 0; + } + for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1812,8 +1852,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, continue; if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len); - else + else { + if (dropped_len) + con->write(con, dropped_text, dropped_len); con->write(con, text, len); + } } } @@ -2076,21 +2119,24 @@ EXPORT_SYMBOL(printk); #define PREFIX_MAX 0 #define printk_time false +#define prb_read_valid(rb, seq, r) false +#define prb_first_valid_seq(rb) 0 + static u64 syslog_seq; -static u32 syslog_idx; static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; -static u64 log_first_seq; -static u32 log_first_idx; -static u64 log_next_seq; -static char *log_text(const struct printk_log *msg) { return NULL; } -static char *log_dict(const struct printk_log *msg) { return NULL; } -static struct printk_log *log_from_idx(u32 idx) { return NULL; } -static u32 log_next(u32 idx) { return 0; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, - u64 seq) { return 0; } +static unsigned long console_dropped; + +static size_t record_print_text(const struct printk_record *r, + bool syslog, bool time) +{ + return 0; +} +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) +{ + return 0; +} static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } @@ -2098,8 +2144,6 @@ static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2384,14 +2428,19 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; + static char dict[LOG_LINE_MAX]; unsigned long flags; bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; if (console_suspended) { up_console_sem(); return; } + prb_rec_init_rd(&r, &info, text, sizeof(text), dict, sizeof(dict)); + /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2422,35 +2471,26 @@ again: } for (;;) { - struct printk_log *msg; size_t ext_len = 0; size_t len; printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (console_seq < log_first_seq) { - len = snprintf(text, sizeof(text), - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); - - /* messages are gone, move to first one */ - console_seq = log_first_seq; - console_idx = log_first_idx; - } else { - len = 0; - } skip: - if (console_seq == log_next_seq) + if (!prb_read_valid(prb, console_seq, &r)) break; - msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (console_seq != r.info->seq) { + console_dropped += r.info->seq - console_seq; + console_seq = r.info->seq; + } + + if (suppress_message_printing(r.info->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and * record that has level above the console loglevel. */ - console_idx = log_next(console_idx); console_seq++; goto skip; } @@ -2461,19 +2501,24 @@ skip: exclusive_console = NULL; } - len += msg_print_text(msg, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time, text + len, sizeof(text) - len); + /* + * Handle extended console text first because later + * record_print_text() will modify the record buffer in-place. + */ if (nr_ext_console_drivers) { - ext_len = msg_print_ext_header(ext_text, + ext_len = info_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq); + r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r.dict_buf[0], + r.info->dict_len, + &r.text_buf[0], + r.info->text_len); } - console_idx = log_next(console_idx); + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); console_seq++; raw_spin_unlock(&logbuf_lock); @@ -2513,7 +2558,7 @@ skip: * flush, no worries. */ raw_spin_lock(&logbuf_lock); - retry = console_seq != log_next_seq; + retry = prb_read_valid(prb, console_seq, NULL); raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); @@ -2582,8 +2627,7 @@ void console_flush_on_panic(enum con_flush_mode mode) unsigned long flags; logbuf_lock_irqsave(flags); - console_seq = log_first_seq; - console_idx = log_first_idx; + console_seq = prb_first_valid_seq(prb); logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -2826,7 +2870,6 @@ void register_console(struct console *newcon) exclusive_console = newcon; exclusive_console_stop_seq = console_seq; console_seq = syslog_seq; - console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -3215,9 +3258,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ @@ -3251,28 +3292,33 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct printk_log *msg; + struct printk_info info; + unsigned int line_count; + struct printk_record r; size_t l = 0; bool ret = false; + prb_rec_init_rd(&r, &info, line, size, NULL, 0); + if (!dumper->active) goto out; - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; - } - - /* last entry */ - if (dumper->cur_seq >= log_next_seq) - goto out; + /* Read text or count text lines? */ + if (line) { + if (!prb_read_valid(prb, dumper->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { + if (!prb_read_valid_info(prb, dumper->cur_seq, + &info, &line_count)) { + goto out; + } + l = get_record_print_text_size(&info, line_count, syslog, + printk_time); - msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, printk_time, line, size); + } - dumper->cur_idx = log_next(dumper->cur_idx); - dumper->cur_seq++; + dumper->cur_seq = r.info->seq + 1; ret = true; out: if (len) @@ -3333,23 +3379,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, char *buf, size_t size, size_t *len) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; unsigned long flags; u64 seq; - u32 idx; u64 next_seq; - u32 next_idx; size_t l = 0; bool ret = false; bool time = printk_time; - if (!dumper->active) + prb_rec_init_rd(&r, &info, buf, size, NULL, 0); + + if (!dumper->active || !buf || !size) goto out; logbuf_lock_irqsave(flags); - if (dumper->cur_seq < log_first_seq) { + if (dumper->cur_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; + dumper->cur_seq = prb_first_valid_seq(prb); } /* last entry */ @@ -3360,41 +3408,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (prb_read_valid_info(prb, seq, &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l += get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (l >= size && seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (l >= size && prb_read_valid_info(prb, seq, + &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l -= get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* last message in next interation */ next_seq = seq; - next_idx = idx; + /* actually read text into the buffer now */ l = 0; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); + while (prb_read_valid(prb, seq, &r)) { + if (r.info->seq >= dumper->next_seq) + break; + + l += record_print_text(&r, syslog, time); + + /* adjust record to store to remaining buffer space */ + prb_rec_init_rd(&r, &info, buf + l, size - l, NULL, 0); - l += msg_print_text(msg, syslog, time, buf + l, size - l); - idx = log_next(idx); - seq++; + seq = r.info->seq + 1; } dumper->next_seq = next_seq; - dumper->next_idx = next_idx; ret = true; logbuf_unlock_irqrestore(flags); out: @@ -3417,9 +3465,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) { dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); } /** -- cgit v1.2.3 From d397820f36ffe4701343b6ee12687d60db0ed8db Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 21 Jul 2020 15:31:28 +0206 Subject: printk: ringbuffer: support dataless records With commit 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer"), printk() started silently dropping messages without text because such records are not supported by the new printk ringbuffer. Add support for such records. Currently dataless records are denoted by INVALID_LPOS in order to recognize failed prb_reserve() calls. Change the ringbuffer to instead use two different identifiers (FAILED_LPOS and NO_LPOS) to distinguish between failed prb_reserve() records and successful dataless records, respectively. Fixes: 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer") Fixes: https://lkml.kernel.org/r/20200718121053.GA691245@elver.google.com Reported-by: Marco Elver Signed-off-by: John Ogness Cc: Petr Mladek Cc: Steven Rostedt Cc: Marco Elver Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200721132528.9661-1-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 72 +++++++++++++++++++-------------------- kernel/printk/printk_ringbuffer.h | 15 ++++---- 2 files changed, 43 insertions(+), 44 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 7355ca99e852..0659b50872b5 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -264,6 +264,9 @@ /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) +/* Determine if a logical position refers to a data-less block. */ +#define LPOS_DATALESS(lpos) ((lpos) & 1UL) + /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) @@ -320,21 +323,13 @@ static unsigned int to_blk_size(unsigned int size) * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. - * - * Writers are also not allowed to write 0-sized (data-less) records. Such - * records are used only internally by the ringbuffer. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { struct prb_data_block *db = NULL; - /* - * Writers are not allowed to write data-less records. Such records - * are used only internally by the ringbuffer to denote records where - * their data failed to allocate or have been lost. - */ if (size == 0) - return false; + return true; /* * Ensure the alignment padded size could possibly fit in the data @@ -568,8 +563,8 @@ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long tail_lpos; unsigned long next_lpos; - /* If @lpos is not valid, there is nothing to do. */ - if (lpos == INVALID_LPOS) + /* If @lpos is from a data-less block, there is nothing to do. */ + if (LPOS_DATALESS(lpos)) return true; /* @@ -962,8 +957,8 @@ static char *data_alloc(struct printk_ringbuffer *rb, if (size == 0) { /* Specify a data-less block. */ - blk_lpos->begin = INVALID_LPOS; - blk_lpos->next = INVALID_LPOS; + blk_lpos->begin = NO_LPOS; + blk_lpos->next = NO_LPOS; return NULL; } @@ -976,8 +971,8 @@ static char *data_alloc(struct printk_ringbuffer *rb, if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { /* Failed to allocate, specify a data-less block. */ - blk_lpos->begin = INVALID_LPOS; - blk_lpos->next = INVALID_LPOS; + blk_lpos->begin = FAILED_LPOS; + blk_lpos->next = FAILED_LPOS; return NULL; } @@ -1025,6 +1020,10 @@ static char *data_alloc(struct printk_ringbuffer *rb, static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { + /* Data-less blocks take no space. */ + if (LPOS_DATALESS(blk_lpos->begin)) + return 0; + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - @@ -1080,11 +1079,8 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; - /* Records are allowed to not have dictionaries. */ - if (r->dict_buf_size) { - if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) - goto fail; - } + if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) + goto fail; /* * Descriptors in the reserved state act as blockers to all further @@ -1205,15 +1201,18 @@ void prb_commit(struct prb_reserved_entry *e) * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ -static char *get_data(struct prb_data_ring *data_ring, - struct prb_data_blk_lpos *blk_lpos, - unsigned int *data_size) +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ - if (blk_lpos->begin == INVALID_LPOS && - blk_lpos->next == INVALID_LPOS) { + if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } return NULL; } @@ -1256,11 +1255,11 @@ static char *get_data(struct prb_data_ring *data_ring, * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ -static unsigned int count_lines(char *text, unsigned int text_size) +static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; - char *next = text; + const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); @@ -1287,7 +1286,7 @@ static bool copy_data(struct prb_data_ring *data_ring, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; - char *data; + const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) @@ -1317,8 +1316,7 @@ static bool copy_data(struct prb_data_ring *data_ring, data_size = min_t(u16, buf_size, len); - if (!WARN_ON_ONCE(!data_size)) - memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } @@ -1355,11 +1353,11 @@ static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, /* * A descriptor in the reusable state may no longer have its data - * available; report it as a data-less record. Or the record may - * actually be a data-less record. + * available; report it as existing but with lost data. Or the record + * may actually be a record with lost data. */ if (d_state == desc_reusable || - (blk_lpos->begin == INVALID_LPOS && blk_lpos->next == INVALID_LPOS)) { + (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { return -ENOENT; } @@ -1659,10 +1657,10 @@ void prb_init(struct printk_ringbuffer *rb, descs[_DESCS_COUNT(descbits) - 1].info.seq = 0; atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); - descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = INVALID_LPOS; - descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = INVALID_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = INVALID_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = FAILED_LPOS; } /** diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 3e46a7423c13..e6302da041f9 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -120,12 +120,13 @@ struct prb_reserved_entry { #define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK) #define DESC_ID_MASK (~DESC_FLAGS_MASK) #define DESC_ID(sv) ((sv) & DESC_ID_MASK) -#define INVALID_LPOS 1 +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 -#define INVALID_BLK_LPOS \ +#define FAILED_BLK_LPOS \ { \ - .begin = INVALID_LPOS, \ - .next = INVALID_LPOS, \ + .begin = FAILED_LPOS, \ + .next = FAILED_LPOS, \ } /* @@ -147,7 +148,7 @@ struct prb_reserved_entry { * * To satisfy Req1, the tail initially points to a descriptor that is * minimally initialized (having no data block, i.e. data-less with the - * data block's lpos @begin and @next values set to INVALID_LPOS). + * data block's lpos @begin and @next values set to FAILED_LPOS). * * To satisfy Req2, the initial tail descriptor is initialized to the * reusable state. Readers recognize reusable descriptors as existing @@ -242,8 +243,8 @@ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ /* reusable */ \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ - .text_blk_lpos = INVALID_BLK_LPOS, \ - .dict_blk_lpos = INVALID_BLK_LPOS, \ + .text_blk_lpos = FAILED_BLK_LPOS, \ + .dict_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ static struct printk_ringbuffer name = { \ -- cgit v1.2.3 From ce003d67ad521d950687bce7e42e5361022c7d3d Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 11:54:02 +0206 Subject: printk: ringbuffer: fix setting state in desc_read() It is expected that desc_read() will always set at least the @state_var field. However, if the descriptor is in an inconsistent state, no fields are set. Also, the second load of @state_var is not stored in @desc_out and so might not match the state value that is returned. Always set the last loaded @state_var into @desc_out, regardless of the descriptor consistency. Fixes: b6cf8b3f3312 ("printk: add lockless ringbuffer") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914094803.27365-1-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 0659b50872b5..88f7dd4cb0c1 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -368,9 +368,9 @@ static enum desc_state get_desc_state(unsigned long id, } /* - * Get a copy of a specified descriptor and its queried state. A descriptor - * that is not in the committed or reusable state must be considered garbage - * by the reader. + * Get a copy of a specified descriptor and return its queried state. If the + * descriptor is in an inconsistent state (miss or reserved), the caller can + * only expect the descriptor's @state_var field to be valid. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out) @@ -383,8 +383,14 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); - if (d_state != desc_committed && d_state != desc_reusable) - return d_state; + if (d_state == desc_miss || d_state == desc_reserved) { + /* + * The descriptor is in an inconsistent state. Set at least + * @state_var so that the caller can see the details of + * the inconsistent state. + */ + goto out; + } /* * Guarantee the state is loaded before copying the descriptor @@ -449,9 +455,15 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, */ smp_rmb(); /* LMM(desc_read:D) */ - /* Re-check the descriptor state. */ + /* + * The data has been copied. Return the current descriptor state, + * which may have changed since the load above. + */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ - return get_desc_state(id, state_val); + d_state = get_desc_state(id, state_val); +out: + atomic_long_set(&desc_out->state_var, state_val); + return d_state; } /* -- cgit v1.2.3 From e7c1fe21046a024a5a9ef46e3752521afc1ada7b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 11:54:03 +0206 Subject: printk: ringbuffer: avoid memcpy() on state_var @state_var is copied as part of the descriptor copying via memcpy(). This is not allowed because @state_var is an atomic type, which in some implementations may contain a spinlock. Avoid using memcpy() with @state_var by explicitly copying the other fields of the descriptor. @state_var is set using atomic set operator before returning. Fixes: b6cf8b3f3312 ("printk: add lockless ringbuffer") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914094803.27365-2-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 88f7dd4cb0c1..11b860ad5264 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -412,9 +412,14 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, /* * Copy the descriptor data. The data is not valid until the - * state has been re-checked. + * state has been re-checked. A memcpy() for all of @desc + * cannot be used because of the atomic_t @state_var field. */ - memcpy(desc_out, desc, sizeof(*desc_out)); /* LMM(desc_read:C) */ + memcpy(&desc_out->info, &desc->info, sizeof(desc_out->info)); /* LMM(desc_read:C) */ + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* also part of desc_read:C */ + memcpy(&desc_out->dict_blk_lpos, &desc->dict_blk_lpos, + sizeof(desc_out->dict_blk_lpos)); /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking -- cgit v1.2.3 From 2a7f87ed05b9035ef818d0f46fe6aef5640224e3 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:49 +0206 Subject: printk: ringbuffer: relocate get_data() Move the internal get_data() function as-is above prb_reserve() so that a later change can make use of the static function. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-2-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 116 +++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 58 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 11b860ad5264..664552cb931c 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1055,6 +1055,64 @@ static unsigned int space_used(struct prb_data_ring *data_ring, DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. */ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + /** * prb_reserve() - Reserve space in the ringbuffer. * @@ -1209,64 +1267,6 @@ void prb_commit(struct prb_reserved_entry *e) local_irq_restore(e->irqflags); } -/* - * Given @blk_lpos, return a pointer to the writer data from the data block - * and calculate the size of the data part. A NULL pointer is returned if - * @blk_lpos specifies values that could never be legal. - * - * This function (used by readers) performs strict validation on the lpos - * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is - * triggered if an internal error is detected. - */ -static const char *get_data(struct prb_data_ring *data_ring, - struct prb_data_blk_lpos *blk_lpos, - unsigned int *data_size) -{ - struct prb_data_block *db; - - /* Data-less data block description. */ - if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { - if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { - *data_size = 0; - return ""; - } - return NULL; - } - - /* Regular data block: @begin less than @next and in same wrap. */ - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && - blk_lpos->begin < blk_lpos->next) { - db = to_block(data_ring, blk_lpos->begin); - *data_size = blk_lpos->next - blk_lpos->begin; - - /* Wrapping data block: @begin is one wrap behind @next. */ - } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == - DATA_WRAPS(data_ring, blk_lpos->next)) { - db = to_block(data_ring, 0); - *data_size = DATA_INDEX(data_ring, blk_lpos->next); - - /* Illegal block description. */ - } else { - WARN_ON_ONCE(1); - return NULL; - } - - /* A valid data block will always be aligned to the ID size. */ - if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || - WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { - return NULL; - } - - /* A valid data block will always have at least an ID. */ - if (WARN_ON_ONCE(*data_size < sizeof(db->id))) - return NULL; - - /* Subtract block ID space from size to reflect data size. */ - *data_size -= sizeof(db->id); - - return &db->data[0]; -} - /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional -- cgit v1.2.3 From e3bc0401c1de2feb15601685b85b4dea45130386 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:50 +0206 Subject: printk: ringbuffer: add BLK_DATALESS() macro Rather than continually needing to explicitly check @begin and @next to identify a dataless block, introduce and use a BLK_DATALESS() macro. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-3-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 664552cb931c..195e6f4d4df6 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -266,6 +266,8 @@ /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) +#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ + LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ @@ -1038,7 +1040,7 @@ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { /* Data-less blocks take no space. */ - if (LPOS_DATALESS(blk_lpos->begin)) + if (BLK_DATALESS(blk_lpos)) return 0; if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { @@ -1071,7 +1073,7 @@ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_block *db; /* Data-less data block description. */ - if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { + if (BLK_DATALESS(blk_lpos)) { if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { *data_size = 0; return ""; -- cgit v1.2.3 From cc5c7041c6e1fe8c02fe9e16f28a5e52f7a6957c Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:51 +0206 Subject: printk: ringbuffer: clear initial reserved fields prb_reserve() will set some meta data values and leave others uninitialized (or rather, containing the values of the previous wrap). Simplify the API by always clearing out all the fields. Only the sequence number is filled in. The caller is now responsible for filling in the rest of the meta data fields. In particular, for correctly filling in text and dict lengths. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 12 ++++++++---- kernel/printk/printk_ringbuffer.c | 30 ++++++++++++++++++------------ 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fec71229169e..964b5701688f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -520,8 +520,11 @@ static int log_store(u32 caller_id, int facility, int level, memcpy(&r.text_buf[0], text, text_len); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); - if (r.dict_buf) + r.info->text_len = text_len + trunc_msg_len; + if (r.dict_buf) { memcpy(&r.dict_buf[0], dict, dict_len); + r.info->dict_len = dict_len; + } r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; @@ -1069,10 +1072,11 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, if (!prb_reserve(&e, rb, &dest_r)) return 0; - memcpy(&dest_r.text_buf[0], &r->text_buf[0], dest_r.text_buf_size); + memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); + dest_r.info->text_len = r->info->text_len; if (dest_r.dict_buf) { - memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], - dest_r.dict_buf_size); + memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], r->info->dict_len); + dest_r.info->dict_len = r->info->dict_len; } dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 195e6f4d4df6..5a9c7c8cff7b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -146,10 +146,13 @@ * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); + * r.info->text_len = strlen(textstr); * * // dictionary allocation may have failed - * if (r.dict_buf) + * if (r.dict_buf) { * snprintf(r.dict_buf, r.dict_buf_size, "%s", dictstr); + * r.info->dict_len = strlen(dictstr); + * } * * r.info->ts_nsec = local_clock(); * @@ -1142,9 +1145,9 @@ static const char *get_data(struct prb_data_ring *data_ring, * @dict_buf_size is set to 0. Writers must check this before writing to * dictionary space. * - * @info->text_len and @info->dict_len will already be set to @text_buf_size - * and @dict_buf_size, respectively. If dictionary space reservation fails, - * @info->dict_len is set to 0. + * Important: @info->text_len and @info->dict_len need to be set correctly by + * the writer in order for data to be readable and/or extended. + * Their values are initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) @@ -1152,6 +1155,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_desc *d; unsigned long id; + u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; @@ -1176,6 +1180,14 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, d = to_desc(desc_ring, id); + /* + * All @info fields (except @seq) are cleared and must be filled in + * by the writer. Save @seq before clearing because it is used to + * determine the new sequence number. + */ + seq = d->info.seq; + memset(&d->info, 0, sizeof(d->info)); + /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. @@ -1194,17 +1206,15 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ - if (d->info.seq == 0 && DESC_INDEX(desc_ring, id) != 0) + if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) d->info.seq = DESC_INDEX(desc_ring, id); else - d->info.seq += DESCS_COUNT(desc_ring); + d->info.seq = seq + DESCS_COUNT(desc_ring); r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { - d->info.text_len = 0; - d->info.dict_len = 0; prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; @@ -1221,10 +1231,6 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, r->info = &d->info; - /* Set default values for the sizes. */ - d->info.text_len = r->text_buf_size; - d->info.dict_len = r->dict_buf_size; - /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); -- cgit v1.2.3 From 10dcb06d40411a73e1ae111717e9a987bb760313 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:52 +0206 Subject: printk: ringbuffer: change representation of states Rather than deriving the state by evaluating bits within the flags area of the state variable, assign the states explicit values and set those values in the flags area. Introduce macros to make it simple to read and write state values for the state variable. Although the functionality is preserved, the binary representation for the states is changed. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-5-john.ogness@linutronix.de --- Documentation/admin-guide/kdump/gdbmacros.txt | 12 +++++++---- kernel/printk/printk_ringbuffer.c | 28 ++++++------------------ kernel/printk/printk_ringbuffer.h | 31 +++++++++++++++++---------- scripts/gdb/linux/dmesg.py | 11 +++++----- 4 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel/printk') diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt index 7adece30237e..8f533b751c46 100644 --- a/Documentation/admin-guide/kdump/gdbmacros.txt +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -295,9 +295,12 @@ document dump_record end define dmesg - set var $desc_committed = 1UL << ((sizeof(long) * 8) - 1) - set var $flags_mask = 3UL << ((sizeof(long) * 8) - 2) - set var $id_mask = ~$flags_mask + # definitions from kernel/printk/printk_ringbuffer.h + set var $desc_committed = 1 + set var $desc_sv_bits = sizeof(long) * 8 + set var $desc_flags_shift = $desc_sv_bits - 2 + set var $desc_flags_mask = 3 << $desc_flags_shift + set var $id_mask = ~$desc_flags_mask set var $desc_count = 1U << prb->desc_ring.count_bits set var $prev_flags = 0 @@ -309,7 +312,8 @@ define dmesg set var $desc = &prb->desc_ring.descs[$id % $desc_count] # skip non-committed record - if (($desc->state_var.counter & $flags_mask) == $desc_committed) + set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift) + if ($state == $desc_committed) dump_record $desc $prev_flags set var $prev_flags = $desc->info.flags end diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 5a9c7c8cff7b..c0d31185ccbf 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -348,14 +348,6 @@ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) return true; } -/* The possible responses of a descriptor state-query. */ -enum desc_state { - desc_miss, /* ID mismatch */ - desc_reserved, /* reserved, in use by writer */ - desc_committed, /* committed, writer is done */ - desc_reusable, /* free, not yet used by any writer */ -}; - /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) @@ -363,13 +355,7 @@ static enum desc_state get_desc_state(unsigned long id, if (id != DESC_ID(state_val)) return desc_miss; - if (state_val & DESC_REUSE_MASK) - return desc_reusable; - - if (state_val & DESC_COMMITTED_MASK) - return desc_committed; - - return desc_reserved; + return DESC_STATE(state_val); } /* @@ -484,8 +470,8 @@ out: static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { - unsigned long val_committed = id | DESC_COMMITTED_MASK; - unsigned long val_reusable = val_committed | DESC_REUSE_MASK; + unsigned long val_committed = DESC_SV(id, desc_committed); + unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; @@ -921,7 +907,7 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && - prev_state_val != (id_prev_wrap | DESC_COMMITTED_MASK | DESC_REUSE_MASK)) { + get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } @@ -935,7 +921,7 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, - id | 0)) { /* LMM(desc_reserve:F) */ + DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } @@ -1254,7 +1240,7 @@ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); - unsigned long prev_state_val = e->id | 0; + unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(prb_commit:A) */ @@ -1267,7 +1253,7 @@ void prb_commit(struct prb_reserved_entry *e) * this. This pairs with desc_read:B. */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, - e->id | DESC_COMMITTED_MASK)) { /* LMM(prb_commit:B) */ + DESC_SV(e->id, desc_committed))) { /* LMM(prb_commit:B) */ WARN_ON_ONCE(1); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index e6302da041f9..a9d85a6727b1 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -112,16 +112,25 @@ struct prb_reserved_entry { unsigned int text_space; }; -#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) -#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) -#define DESC_SV_BITS (sizeof(unsigned long) * 8) -#define DESC_COMMITTED_MASK (1UL << (DESC_SV_BITS - 1)) -#define DESC_REUSE_MASK (1UL << (DESC_SV_BITS - 2)) -#define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK) -#define DESC_ID_MASK (~DESC_FLAGS_MASK) -#define DESC_ID(sv) ((sv) & DESC_ID_MASK) -#define FAILED_LPOS 0x1 -#define NO_LPOS 0x3 +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss = -1, /* ID mismatch (pseudo state) */ + desc_reserved = 0x0, /* reserved, in use by writer */ + desc_committed = 0x1, /* committed by writer */ + desc_reusable = 0x3, /* free, not yet used by any writer */ +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) +#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) +#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) +#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 #define FAILED_BLK_LPOS \ { \ @@ -213,7 +222,7 @@ struct prb_reserved_entry { */ #define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) #define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) -#define DESC0_SV(ct_bits) (DESC_COMMITTED_MASK | DESC_REUSE_MASK | DESC0_ID(ct_bits)) +#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) /* * Define a ringbuffer with an external text data buffer. The same as diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py index 6c6022012ea8..dd8c0b95063a 100644 --- a/scripts/gdb/linux/dmesg.py +++ b/scripts/gdb/linux/dmesg.py @@ -78,10 +78,10 @@ class LxDmesg(gdb.Command): len_off = off + printk_info_type.get_type()['text_len'].bitpos // 8 # definitions from kernel/printk/printk_ringbuffer.h + desc_committed = 1 desc_sv_bits = utils.get_long_type().sizeof * 8 - desc_committed_mask = 1 << (desc_sv_bits - 1) - desc_reuse_mask = 1 << (desc_sv_bits - 2) - desc_flags_mask = desc_committed_mask | desc_reuse_mask + desc_flags_shift = desc_sv_bits - 2 + desc_flags_mask = 3 << desc_flags_shift desc_id_mask = ~desc_flags_mask # read in tail and head descriptor ids @@ -96,8 +96,9 @@ class LxDmesg(gdb.Command): desc_off = desc_sz * ind # skip non-committed record - state = utils.read_u64(descs, desc_off + sv_off + counter_off) & desc_flags_mask - if state != desc_committed_mask: + state = 3 & (utils.read_u64(descs, desc_off + sv_off + + counter_off) >> desc_flags_shift) + if state != desc_committed: if did == head_id: break did = (did + 1) & desc_id_mask -- cgit v1.2.3 From 4cfc7258f876a7feba673ac6d050f525b39cc84c Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:53 +0206 Subject: printk: ringbuffer: add finalization/extension support Add support for extending the newest data block. For this, introduce a new finalization state (desc_finalized) denoting a committed descriptor that cannot be extended. Until a record is finalized, a writer can reopen that record to append new data. Reopening a record means transitioning from the desc_committed state back to the desc_reserved state. A writer can explicitly finalize a record if there is no intention of extending it. Also, records are automatically finalized when a new record is reserved. This relieves writers of needing to explicitly finalize while also making such records available to readers sooner. (Readers can only traverse finalized records.) Four new memory barrier pairs are introduced. Two of them are insignificant additions (data_realloc:A/desc_read:D and data_realloc:A/data_push_tail:B) because they are alternate path memory barriers that exactly match the purpose, pairing, and context of the two existing memory barrier pairs they provide an alternate path for. The other two new memory barrier pairs are significant additions: desc_reopen_last:A / _prb_commit:B - When reopening a descriptor, ensure the state transitions back to desc_reserved before fully trusting the descriptor data. _prb_commit:B / desc_reserve:D - When committing a descriptor, ensure the state transitions to desc_committed before checking the head ID to see if the descriptor needs to be finalized. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-6-john.ogness@linutronix.de --- Documentation/admin-guide/kdump/gdbmacros.txt | 3 +- kernel/printk/printk_ringbuffer.c | 525 +++++++++++++++++++++++--- kernel/printk/printk_ringbuffer.h | 6 +- scripts/gdb/linux/dmesg.py | 3 +- 4 files changed, 480 insertions(+), 57 deletions(-) (limited to 'kernel/printk') diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt index 8f533b751c46..94fabb165abf 100644 --- a/Documentation/admin-guide/kdump/gdbmacros.txt +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -297,6 +297,7 @@ end define dmesg # definitions from kernel/printk/printk_ringbuffer.h set var $desc_committed = 1 + set var $desc_finalized = 2 set var $desc_sv_bits = sizeof(long) * 8 set var $desc_flags_shift = $desc_sv_bits - 2 set var $desc_flags_mask = 3 << $desc_flags_shift @@ -313,7 +314,7 @@ define dmesg # skip non-committed record set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift) - if ($state == $desc_committed) + if ($state == $desc_committed || $state == $desc_finalized) dump_record $desc $prev_flags set var $prev_flags = $desc->info.flags end diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index c0d31185ccbf..f4e2e9890e0f 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -46,20 +46,26 @@ * into a single descriptor field named @state_var, allowing ID and state to * be synchronously and atomically updated. * - * Descriptors have three states: + * Descriptors have four states: * * reserved * A writer is modifying the record. * * committed - * The record and all its data are complete and available for reading. + * The record and all its data are written. A writer can reopen the + * descriptor (transitioning it back to reserved), but in the committed + * state the data is consistent. + * + * finalized + * The record and all its data are complete and available for reading. A + * writer cannot reopen the descriptor. * * reusable * The record exists, but its text and/or dictionary data may no longer * be available. * * Querying the @state_var of a record requires providing the ID of the - * descriptor to query. This can yield a possible fourth (pseudo) state: + * descriptor to query. This can yield a possible fifth (pseudo) state: * * miss * The descriptor being queried has an unexpected ID. @@ -79,6 +85,28 @@ * committed or reusable queried state. This makes it possible that a valid * sequence number of the tail is always available. * + * Descriptor Finalization + * ~~~~~~~~~~~~~~~~~~~~~~~ + * When a writer calls the commit function prb_commit(), record data is + * fully stored and is consistent within the ringbuffer. However, a writer can + * reopen that record, claiming exclusive access (as with prb_reserve()), and + * modify that record. When finished, the writer must again commit the record. + * + * In order for a record to be made available to readers (and also become + * recyclable for writers), it must be finalized. A finalized record cannot be + * reopened and can never become "unfinalized". Record finalization can occur + * in three different scenarios: + * + * 1) A writer can simultaneously commit and finalize its record by calling + * prb_final_commit() instead of prb_commit(). + * + * 2) When a new record is reserved and the previous record has been + * committed via prb_commit(), that previous record is automatically + * finalized. + * + * 3) When a record is committed via prb_commit() and a newer record + * already exists, the record being committed is automatically finalized. + * * Data Rings * ~~~~~~~~~~ * The two data rings (text and dictionary) function identically. They exist @@ -97,7 +125,7 @@ * are met: * * 1) The descriptor associated with the data block is in the committed - * queried state. + * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. @@ -156,9 +184,38 @@ * * r.info->ts_nsec = local_clock(); * + * prb_final_commit(&e); + * } + * + * Note that additional writer functions are available to extend a record + * after it has been committed but not yet finalized. This can be done as + * long as no new records have been reserved and the caller is the same. + * + * Sample writer code (record extending):: + * + * // alternate rest of previous example + * r.info->ts_nsec = local_clock(); + * r.info->text_len = strlen(textstr); + * r.info->caller_id = printk_caller_id(); + * + * // commit the record (but do not finalize yet) * prb_commit(&e); * } * + * ... + * + * // specify additional 5 bytes text space to extend + * prb_rec_init_wr(&r, 5, 0); + * + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id())) { + * snprintf(&r.text_buf[r.info->text_len], + * r.text_buf_size - r.info->text_len, "hello"); + * + * r.info->text_len += 5; + * + * prb_final_commit(&e); + * } + * * Sample reader code:: * * struct printk_info info; @@ -236,15 +293,21 @@ * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * - * data_alloc:A / desc_read:D + * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * - * data_alloc:A / data_push_tail:B + * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * - * prb_commit:B / desc_read:B + * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * + * desc_reopen_last:A / _prb_commit:B + * set descriptor reserved (state), then read descriptor data + * + * _prb_commit:B / desc_reserve:D + * set new descriptor committed (state), then check descriptor head (id) + * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * @@ -386,16 +449,16 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might - * not apply to the descriptor state. This pairs with prb_commit:B. + * not apply to the descriptor state. This pairs with _prb_commit:B. * * Memory barrier involvement: * - * If desc_read:A reads from prb_commit:B, then desc_read:C reads - * from prb_commit:A. + * If desc_read:A reads from _prb_commit:B, then desc_read:C reads + * from _prb_commit:A. * * Relies on: * - * WMB from prb_commit:A to prb_commit:B + * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ @@ -431,7 +494,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may - * not apply to the copied data. This pairs with data_alloc:A. + * not apply to the copied data. This pairs with data_alloc:A and + * data_realloc:A. * * Memory barrier involvement: * @@ -463,19 +527,19 @@ out: } /* - * Take a specified descriptor out of the committed state by attempting - * the transition from committed to reusable. Either this context or some + * Take a specified descriptor out of the finalized state by attempting + * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { - unsigned long val_committed = DESC_SV(id, desc_committed); + unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; - atomic_long_cmpxchg_relaxed(state_var, val_committed, + atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } @@ -484,7 +548,7 @@ static void desc_make_reusable(struct prb_desc_ring *desc_ring, * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either - * the descriptor has not yet been committed or another writer context has + * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. */ @@ -528,10 +592,10 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, switch (d_state) { case desc_miss: - return false; case desc_reserved: - return false; case desc_committed: + return false; + case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. @@ -616,7 +680,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with - * data_alloc:A. + * data_alloc:A and data_realloc:A. * * Memory barrier involvement: * @@ -729,8 +793,9 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, */ return true; case desc_reserved: - return false; case desc_committed: + return false; + case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: @@ -751,7 +816,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, /* * Check the next descriptor after @tail_id before pushing the tail - * to it because the tail must always be in a committed or reusable + * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or @@ -760,7 +825,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc); /* LMM(desc_push_tail:A) */ - if (d_state == desc_committed || d_state == desc_reusable) { + if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows @@ -895,6 +960,10 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. + * + * 5. Guarantee the head ID is stored before trying to + * finalize the previous descriptor. This pairs with + * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ @@ -1024,6 +1093,84 @@ static char *data_alloc(struct printk_ringbuffer *rb, return &blk->data[0]; } +/* + * Try to resize an existing data block associated with the descriptor + * specified by @id. If the resized data block should become wrapped, it + * copies the old data to the new data block. If @size yields a data block + * with the same or less size, the data block is left as is. + * + * Fail if this is not the last allocated data block or if there is not + * enough space or it is not possible make enough space. + * + * Return a pointer to the beginning of the entire data buffer or NULL on + * failure. + */ +static char *data_realloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long head_lpos; + unsigned long next_lpos; + bool wrapped; + + /* Reallocation only works if @blk_lpos is the newest data block. */ + head_lpos = atomic_long_read(&data_ring->head_lpos); + if (head_lpos != blk_lpos->next) + return NULL; + + /* Keep track if @blk_lpos was a wrapping data block. */ + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + + size = to_blk_size(size); + + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); + + /* If the data block does not increase, there is nothing to do. */ + if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, blk_lpos->begin); + return &blk->data[0]; + } + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) + return NULL; + + /* The memory barrier involvement is the same as data_alloc:A. */ + if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, + next_lpos)) { /* LMM(data_realloc:A) */ + return NULL; + } + + blk = to_block(data_ring, blk_lpos->begin); + + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + struct prb_data_block *old_blk = blk; + + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + + if (!wrapped) { + /* + * Since the allocated space is now in the newly + * created wrapping data block, copy the content + * from the old data block. + */ + memcpy(&blk->data[0], &old_blk->data[0], + (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); + } + } + + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + /* Return the number of bytes used by a data block. */ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) @@ -1104,6 +1251,206 @@ static const char *get_data(struct prb_data_ring *data_ring, return &db->data[0]; } +/* + * Attempt to transition the newest descriptor from committed back to reserved + * so that the record can be modified by a writer again. This is only possible + * if the descriptor is not yet finalized and the provided @caller_id matches. + */ +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, + u32 caller_id, unsigned long *id_out) +{ + unsigned long prev_state_val; + enum desc_state d_state; + struct prb_desc desc; + struct prb_desc *d; + unsigned long id; + + id = atomic_long_read(&desc_ring->head_id); + + /* + * To reduce unnecessarily reopening, first check if the descriptor + * state and caller ID are correct. + */ + d_state = desc_read(desc_ring, id, &desc); + if (d_state != desc_committed || desc.info.caller_id != caller_id) + return NULL; + + d = to_desc(desc_ring, id); + + prev_state_val = DESC_SV(id, desc_committed); + + /* + * Guarantee the reserved state is stored before reading any + * record data. A full memory barrier is needed because @state_var + * modification is followed by reading. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_reopen_last:A reads from _prb_commit:B, then + * prb_reserve_in_last:A reads from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * MB If desc_reopen_last:A to prb_reserve_in_last:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ + return NULL; + } + + *id_out = id; + return d; +} + +/** + * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer + * used by the newest record. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to re-reserve and extend data in. + * @r: The record structure to allocate buffers for. + * @caller_id: The caller ID of the caller (reserving writer). + * + * This is the public function available to writers to re-reserve and extend + * data. + * + * The writer specifies the text size to extend (not the new total size) by + * setting the @text_buf_size field of @r. Extending dictionaries is not + * supported, so @dict_buf_size of @r should be set to 0. To ensure proper + * initialization of @r, prb_rec_init_wr() should be used. + * + * This function will fail if @caller_id does not match the caller ID of the + * newest record. In that case the caller must reserve new data using + * prb_reserve(). + * + * Context: Any context. Disables local interrupts on success. + * Return: true if text data could be extended, otherwise false. + * + * On success: + * + * - @r->text_buf points to the beginning of the entire text buffer. + * + * - @r->text_buf_size is set to the new total size of the buffer. + * + * - @r->dict_buf and @r->dict_buf_size are cleared because extending + * the dict buffer is not supported. + * + * - @r->info is not touched so that @r->info->text_len could be used + * to append the text. + * + * - prb_record_text_space() can be used on @e to query the new + * actually used space. + * + * Important: All @r->info fields will already be set with the current values + * for the record. I.e. @r->info->text_len will be less than + * @text_buf_size and @r->info->dict_len may be set, even though + * @dict_buf_size is 0. Writers can use @r->info->text_len to know + * where concatenation begins and writers should update + * @r->info->text_len after concatenating. + */ +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id) +{ + unsigned int data_size; + struct prb_desc *d; + unsigned long id; + + local_irq_save(e->irqflags); + + /* Transition the newest descriptor back to the reserved state. */ + d = desc_reopen_last(&rb->desc_ring, caller_id, &id); + if (!d) { + local_irq_restore(e->irqflags); + goto fail_reopen; + } + + /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + + /* + * Set the @e fields here so that prb_commit() can be used if + * anything fails from now on. + */ + e->rb = rb; + e->id = id; + + /* + * desc_reopen_last() checked the caller_id, but there was no + * exclusive access at that point. The descriptor may have + * changed since then. + */ + if (caller_id != d->info.caller_id) + goto fail; + + if (BLK_DATALESS(&d->text_blk_lpos)) { + if (WARN_ON_ONCE(d->info.text_len != 0)) { + pr_warn_once("wrong text_len value (%hu, expecting 0)\n", + d->info.text_len); + d->info.text_len = 0; + } + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } else { + if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) + goto fail; + + /* + * Increase the buffer size to include the original size. If + * the meta data (@text_len) is not sane, use the full data + * block size. + */ + if (WARN_ON_ONCE(d->info.text_len > data_size)) { + pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", + d->info.text_len, data_size); + d->info.text_len = data_size; + } + r->text_buf_size += d->info.text_len; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } + if (r->text_buf_size && !r->text_buf) + goto fail; + + /* Although dictionary data may be in use, it cannot be extended. */ + r->dict_buf = NULL; + r->dict_buf_size = 0; + + r->info = &d->info; + + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ +fail_reopen: + /* Make it clear to the caller that the re-reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* + * Attempt to finalize a specified descriptor. If this fails, the descriptor + * is either already final or it will finalize itself when the writer commits. + */ +static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) +{ + unsigned long prev_state_val = DESC_SV(id, desc_committed); + struct prb_desc *d = to_desc(desc_ring, id); + + atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, + DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ +} + /** * prb_reserve() - Reserve space in the ringbuffer. * @@ -1197,6 +1544,15 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, else d->info.seq = seq + DESCS_COUNT(desc_ring); + /* + * New data is about to be reserved. Once that happens, previous + * descriptors are no longer able to be extended. Finalize the + * previous descriptor now so that it can be made available to + * readers. (For seq==0 there is no previous descriptor.) + */ + if (d->info.seq > 0) + desc_make_final(desc_ring, DESC_ID(id - 1)); + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ @@ -1227,33 +1583,40 @@ fail: return false; } -/** - * prb_commit() - Commit (previously reserved) data to the ringbuffer. - * - * @e: The entry containing the reserved data information. - * - * This is the public function available to writers to commit data. - * - * Context: Any context. Enables local interrupts. - */ -void prb_commit(struct prb_reserved_entry *e) +/* Commit the data (possibly finalizing it) and restore interrupts. */ +static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); - /* Now the writer has finished all writing: LMM(prb_commit:A) */ + /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * - * Guarantee all record data is stored before the descriptor state - * is stored as committed. A write memory barrier is sufficient for - * this. This pairs with desc_read:B. + * 1 Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient + * for this. This pairs with desc_read:B and desc_reopen_last:A. + * + * 2. Guarantee the descriptor state is stored as committed before + * re-checking the head ID in order to possibly finalize this + * descriptor. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_commit:A reads from desc_reserve:D, then + * desc_make_final:A reads from _prb_commit:B. + * + * Relies on: + * + * MB _prb_commit:B to prb_commit:A + * matching + * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, - DESC_SV(e->id, desc_committed))) { /* LMM(prb_commit:B) */ + DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } @@ -1261,6 +1624,59 @@ void prb_commit(struct prb_reserved_entry *e) local_irq_restore(e->irqflags); } +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Note that the data is not yet available to readers until it is finalized. + * Finalizing happens automatically when space for the next record is + * reserved. + * + * See prb_final_commit() for a version of this function that finalizes + * immediately. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + unsigned long head_id; + + _prb_commit(e, desc_committed); + + /* + * If this descriptor is no longer the head (i.e. a new record has + * been allocated), extending the data for this record is no longer + * allowed and therefore it must be finalized. + */ + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ + if (head_id != e->id) + desc_make_final(desc_ring, e->id); +} + +/** + * prb_final_commit() - Commit and finalize (previously reserved) data to + * the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit+finalize data. + * + * By finalizing, the data is made immediately available to readers. + * + * This function should only be used if there are no intentions of extending + * this data using prb_reserve_in_last(). + * + * Context: Any context. Enables local interrupts. + */ +void prb_final_commit(struct prb_reserved_entry *e) +{ + _prb_commit(e, desc_finalized); +} + /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional @@ -1312,7 +1728,7 @@ static bool copy_data(struct prb_data_ring *data_ring, * because of the trailing alignment padding. */ if (WARN_ON_ONCE(data_size < (unsigned int)len)) { - pr_warn_once("wrong data size (%u, expecting %hu) for data: %.*s\n", + pr_warn_once("wrong data size (%u, expecting >=%hu) for data: %.*s\n", data_size, len, data_size, data); return false; } @@ -1333,16 +1749,16 @@ static bool copy_data(struct prb_data_ring *data_ring, /* * This is an extended version of desc_read(). It gets a copy of a specified - * descriptor. However, it also verifies that the record is committed and has + * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. * * Error return values: - * -EINVAL: A committed record with sequence number @seq does not exist. - * -ENOENT: A committed record with sequence number @seq exists, but its data + * -EINVAL: A finalized record with sequence number @seq does not exist. + * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ -static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, +static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { @@ -1353,11 +1769,12 @@ static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, /* * An unexpected @id (desc_miss) or @seq mismatch means the record - * does not exist. A descriptor in the reserved state means the - * record does not yet exist for the reader. + * does not exist. A descriptor in the reserved or committed state + * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || + d_state == desc_committed || desc_out->info.seq != seq) { return -EINVAL; } @@ -1379,7 +1796,7 @@ static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, * Copy the ringbuffer data from the record with @seq to the provided * @r buffer. On success, 0 is returned. * - * See desc_read_committed_seq() for error return values. + * See desc_read_finalized_seq() for error return values. */ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) @@ -1395,7 +1812,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, id = DESC_ID(atomic_long_read(state_var)); /* Get a local copy of the correct descriptor (if available). */ - err = desc_read_committed_seq(desc_ring, id, seq, &desc); + err = desc_read_finalized_seq(desc_ring, id, seq, &desc); /* * If @r is NULL, the caller is only interested in the availability @@ -1425,8 +1842,8 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, r->info->dict_len = 0; } - /* Ensure the record is still committed and has the same @seq. */ - return desc_read_committed_seq(desc_ring, id, seq, &desc); + /* Ensure the record is still finalized and has the same @seq. */ + return desc_read_finalized_seq(desc_ring, id, seq, &desc); } /* Get the sequence number of the tail descriptor. */ @@ -1444,9 +1861,9 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) /* * This loop will not be infinite because the tail is - * _always_ in the committed or reusable state. + * _always_ in the finalized or reusable state. */ - if (d_state == desc_committed || d_state == desc_reusable) + if (d_state == desc_finalized || d_state == desc_reusable) break; /* @@ -1473,8 +1890,8 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) } /* - * Non-blocking read of a record. Updates @seq to the last committed record - * (which may have no data). + * Non-blocking read of a record. Updates @seq to the last finalized record + * (which may have no data available). * * See the description of prb_read_valid() and prb_read_valid_info() * for details. @@ -1500,7 +1917,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, (*seq)++; } else { - /* Non-existent/non-committed record. Must stop. */ + /* Non-existent/non-finalized record. Must stop. */ return false; } } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index a9d85a6727b1..853ea62dc5f2 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -116,7 +116,8 @@ struct prb_reserved_entry { enum desc_state { desc_miss = -1, /* ID mismatch (pseudo state) */ desc_reserved = 0x0, /* reserved, in use by writer */ - desc_committed = 0x1, /* committed by writer */ + desc_committed = 0x1, /* committed by writer, could get reopened */ + desc_finalized = 0x2, /* committed, no further modification allowed */ desc_reusable = 0x3, /* free, not yet used by any writer */ }; @@ -327,7 +328,10 @@ static inline void prb_rec_init_wr(struct printk_record *r, bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r); +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id); void prb_commit(struct prb_reserved_entry *e); +void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py index dd8c0b95063a..bce14de5f610 100644 --- a/scripts/gdb/linux/dmesg.py +++ b/scripts/gdb/linux/dmesg.py @@ -79,6 +79,7 @@ class LxDmesg(gdb.Command): # definitions from kernel/printk/printk_ringbuffer.h desc_committed = 1 + desc_finalized = 2 desc_sv_bits = utils.get_long_type().sizeof * 8 desc_flags_shift = desc_sv_bits - 2 desc_flags_mask = 3 << desc_flags_shift @@ -98,7 +99,7 @@ class LxDmesg(gdb.Command): # skip non-committed record state = 3 & (utils.read_u64(descs, desc_off + sv_off + counter_off) >> desc_flags_shift) - if state != desc_committed: + if state != desc_committed and state != desc_finalized: if did == head_id: break did = (did + 1) & desc_id_mask -- cgit v1.2.3 From f5f022e53b874f978dda23847173cbf2589b07f5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:54 +0206 Subject: printk: reimplement log_cont using record extension Use the record extending feature of the ringbuffer to implement continuous messages. This preserves the existing continuous message behavior. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-7-john.ogness@linutronix.de --- kernel/printk/printk.c | 98 +++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 78 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 964b5701688f..9a2e23191576 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -535,7 +535,10 @@ static int log_store(u32 caller_id, int facility, int level, r.info->caller_id = caller_id; /* insert message */ - prb_commit(&e); + if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + prb_commit(&e); + else + prb_final_commit(&e); return (text_len + trunc_msg_len); } @@ -1084,7 +1087,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; - prb_commit(&e); + prb_final_commit(&e); return prb_record_text_space(&e); } @@ -1884,87 +1887,26 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -/* - * Continuation lines are buffered, and not committed to the record buffer - * until the line is complete, or a race forces it. The line fragments - * though, are printed immediately to the consoles to ensure everything has - * reached the console in case of a kernel crash. - */ -static struct cont { - char buf[LOG_LINE_MAX]; - size_t len; /* length == 0 means unused buffer */ - u32 caller_id; /* printk_caller_id() of first print */ - u64 ts_nsec; /* time of first print */ - u8 level; /* log level of first message */ - u8 facility; /* log facility of first message */ - enum log_flags flags; /* prefix, newline flags */ -} cont; - -static void cont_flush(void) -{ - if (cont.len == 0) - return; - - log_store(cont.caller_id, cont.facility, cont.level, cont.flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.len = 0; -} - -static bool cont_add(u32 caller_id, int facility, int level, - enum log_flags flags, const char *text, size_t len) -{ - /* If the line gets too long, split it up in separate records. */ - if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); - return false; - } - - if (!cont.len) { - cont.facility = facility; - cont.level = level; - cont.caller_id = caller_id; - cont.ts_nsec = local_clock(); - cont.flags = flags; - } - - memcpy(cont.buf + cont.len, text, len); - cont.len += len; - - // The original flags come from the first line, - // but later continuations can add a newline. - if (flags & LOG_NEWLINE) { - cont.flags |= LOG_NEWLINE; - cont_flush(); - } - - return true; -} - static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); - /* - * If an earlier line was buffered, and we're a continuation - * write from the same context, try to add it to the buffer. - */ - if (cont.len) { - if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) - return text_len; - } - /* Otherwise, make sure it's flushed */ - cont_flush(); - } - - /* Skip empty continuation lines that couldn't be added - they just flush */ - if (!text_len && (lflags & LOG_CONT)) - return 0; - - /* If it doesn't end in a newline, try to buffer the current line */ - if (!(lflags & LOG_NEWLINE)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) + if (lflags & LOG_CONT) { + struct prb_reserved_entry e; + struct printk_record r; + + prb_rec_init_wr(&r, text_len, 0); + if (prb_reserve_in_last(&e, prb, &r, caller_id)) { + memcpy(&r.text_buf[r.info->text_len], text, text_len); + r.info->text_len += text_len; + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + } else { + prb_commit(&e); + } return text_len; + } } /* Store it in the record log */ -- cgit v1.2.3 From cfe2790b163acdc9c058a63bff310923e84a16b4 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sat, 19 Sep 2020 00:40:19 +0206 Subject: printk: move printk_info into separate array The majority of the size of a descriptor is taken up by meta data, which is often not of interest to the ringbuffer (for example, when performing state checks). Since descriptors are often temporarily stored on the stack, keeping their size minimal will help reduce stack pressure. Rather than embedding the printk_info into the descriptor, create a separate printk_info array. The index of a descriptor in the descriptor array corresponds to the printk_info with the same index in the printk_info array. The rules for validity of a printk_info match the existing rules for the data blocks: the descriptor must be in a consistent state. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200918223421.21621-2-john.ogness@linutronix.de --- kernel/printk/printk.c | 30 ++++++-- kernel/printk/printk_ringbuffer.c | 145 ++++++++++++++++++++++++-------------- kernel/printk/printk_ringbuffer.h | 29 ++++---- 3 files changed, 133 insertions(+), 71 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a2e23191576..25cfe4fe48af 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -959,11 +959,11 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); - VMCOREINFO_OFFSET(prb_desc, info); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_OFFSET(prb_desc, dict_blk_lpos); @@ -1097,11 +1097,13 @@ static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] __initdata; void __init setup_log_buf(int early) { + struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; size_t new_descs_size; + size_t new_infos_size; unsigned long flags; char *new_dict_buf; char *new_log_buf; @@ -1142,8 +1144,7 @@ void __init setup_log_buf(int early) if (unlikely(!new_dict_buf)) { pr_err("log_buf_len: %lu dict bytes not available\n", new_log_buf_len); - memblock_free(__pa(new_log_buf), new_log_buf_len); - return; + goto err_free_log_buf; } new_descs_size = new_descs_count * sizeof(struct prb_desc); @@ -1151,9 +1152,15 @@ void __init setup_log_buf(int early) if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); - memblock_free(__pa(new_dict_buf), new_log_buf_len); - memblock_free(__pa(new_log_buf), new_log_buf_len); - return; + goto err_free_dict_buf; + } + + new_infos_size = new_descs_count * sizeof(struct printk_info); + new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); + if (unlikely(!new_infos)) { + pr_err("log_buf_len: %zu info bytes not available\n", + new_infos_size); + goto err_free_descs; } prb_rec_init_rd(&r, &info, @@ -1163,7 +1170,8 @@ void __init setup_log_buf(int early) prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_dict_buf, ilog2(new_log_buf_len), - new_descs, ilog2(new_descs_count)); + new_descs, ilog2(new_descs_count), + new_infos); logbuf_lock_irqsave(flags); @@ -1192,6 +1200,14 @@ void __init setup_log_buf(int early) pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); + return; + +err_free_descs: + memblock_free(__pa(new_descs), new_descs_size); +err_free_dict_buf: + memblock_free(__pa(new_dict_buf), new_log_buf_len); +err_free_log_buf: + memblock_free(__pa(new_log_buf), new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index f4e2e9890e0f..de4b10a98623 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -15,10 +15,10 @@ * The printk_ringbuffer is made up of 3 internal ringbuffers: * * desc_ring - * A ring of descriptors. A descriptor contains all record meta data - * (sequence number, timestamp, loglevel, etc.) as well as internal state - * information about the record and logical positions specifying where in - * the other ringbuffers the text and dictionary strings are located. + * A ring of descriptors and their meta data (such as sequence number, + * timestamp, loglevel, etc.) as well as internal state information about + * the record and logical positions specifying where in the other + * ringbuffers the text and dictionary strings are located. * * text_data_ring * A ring of data blocks. A data block consists of an unsigned long @@ -38,13 +38,14 @@ * * Descriptor Ring * ~~~~~~~~~~~~~~~ - * The descriptor ring is an array of descriptors. A descriptor contains all - * the meta data of a printk record as well as blk_lpos structs pointing to - * associated text and dictionary data blocks (see "Data Rings" below). Each - * descriptor is assigned an ID that maps directly to index values of the - * descriptor array and has a state. The ID and the state are bitwise combined - * into a single descriptor field named @state_var, allowing ID and state to - * be synchronously and atomically updated. + * The descriptor ring is an array of descriptors. A descriptor contains + * essential meta data to track the data of a printk record using + * blk_lpos structs pointing to associated text and dictionary data blocks + * (see "Data Rings" below). Each descriptor is assigned an ID that maps + * directly to index values of the descriptor array and has a state. The ID + * and the state are bitwise combined into a single descriptor field named + * @state_var, allowing ID and state to be synchronously and atomically + * updated. * * Descriptors have four states: * @@ -150,6 +151,14 @@ * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * + * Info Array + * ~~~~~~~~~~ + * The general meta data of printk records are stored in printk_info structs, + * stored in an array with the same number of elements as the descriptor ring. + * Each info corresponds to the descriptor of the same index in the + * descriptor ring. Info validity is confirmed by evaluating the corresponding + * descriptor before and after loading the info. + * * Usage * ----- * Here are some simple examples demonstrating writers and readers. For the @@ -367,6 +376,15 @@ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } +/* + * Return the printk_info associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; +} + static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { @@ -425,10 +443,16 @@ static enum desc_state get_desc_state(unsigned long id, * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. + * + * The sequence number and caller_id can be optionally retrieved. Like all + * non-state_var data, they are only valid if the descriptor is in a + * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, - unsigned long id, struct prb_desc *desc_out) + unsigned long id, struct prb_desc *desc_out, + u64 *seq_out, u32 *caller_id_out) { + struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; @@ -469,11 +493,14 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ - memcpy(&desc_out->info, &desc->info, sizeof(desc_out->info)); /* LMM(desc_read:C) */ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, - sizeof(desc_out->text_blk_lpos)); /* also part of desc_read:C */ + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ memcpy(&desc_out->dict_blk_lpos, &desc->dict_blk_lpos, sizeof(desc_out->dict_blk_lpos)); /* also part of desc_read:C */ + if (seq_out) + *seq_out = info->seq; /* also part of desc_read:C */ + if (caller_id_out) + *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking @@ -588,7 +615,8 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, */ id = blk->id; /* LMM(data_make_reusable:A) */ - d_state = desc_read(desc_ring, id, &desc); /* LMM(data_make_reusable:B) */ + d_state = desc_read(desc_ring, id, &desc, + NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: @@ -771,7 +799,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, enum desc_state d_state; struct prb_desc desc; - d_state = desc_read(desc_ring, tail_id, &desc); + d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: @@ -823,7 +851,8 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, * equal to @head_id so there is no risk of pushing the tail past the * head. */ - d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc); /* LMM(desc_push_tail:A) */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, + NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* @@ -1264,6 +1293,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, struct prb_desc desc; struct prb_desc *d; unsigned long id; + u32 cid; id = atomic_long_read(&desc_ring->head_id); @@ -1271,8 +1301,8 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ - d_state = desc_read(desc_ring, id, &desc); - if (d_state != desc_committed || desc.info.caller_id != caller_id) + d_state = desc_read(desc_ring, id, &desc, NULL, &cid); + if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); @@ -1353,6 +1383,8 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id) { + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; @@ -1360,7 +1392,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ - d = desc_reopen_last(&rb->desc_ring, caller_id, &id); + d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; @@ -1368,6 +1400,8 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + info = to_info(desc_ring, id); + /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. @@ -1380,14 +1414,14 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer * exclusive access at that point. The descriptor may have * changed since then. */ - if (caller_id != d->info.caller_id) + if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { - if (WARN_ON_ONCE(d->info.text_len != 0)) { + if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", - d->info.text_len); - d->info.text_len = 0; + info->text_len); + info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) @@ -1404,12 +1438,12 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer * the meta data (@text_len) is not sane, use the full data * block size. */ - if (WARN_ON_ONCE(d->info.text_len > data_size)) { + if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", - d->info.text_len, data_size); - d->info.text_len = data_size; + info->text_len, data_size); + info->text_len = data_size; } - r->text_buf_size += d->info.text_len; + r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; @@ -1424,7 +1458,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer r->dict_buf = NULL; r->dict_buf_size = 0; - r->info = &d->info; + r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); @@ -1486,6 +1520,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; @@ -1512,14 +1547,15 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, } d = to_desc(desc_ring, id); + info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ - seq = d->info.seq; - memset(&d->info, 0, sizeof(d->info)); + seq = info->seq; + memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if @@ -1533,16 +1569,16 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, - * _except_ for @descs[0], which was specially setup by the ringbuffer + * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. * * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) - d->info.seq = DESC_INDEX(desc_ring, id); + info->seq = DESC_INDEX(desc_ring, id); else - d->info.seq = seq + DESCS_COUNT(desc_ring); + info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous @@ -1550,7 +1586,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ - if (d->info.seq > 0) + if (info->seq > 0) desc_make_final(desc_ring, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, @@ -1571,7 +1607,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (r->dict_buf_size && !r->dict_buf) r->dict_buf_size = 0; - r->info = &d->info; + r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); @@ -1726,12 +1762,12 @@ static bool copy_data(struct prb_data_ring *data_ring, /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. + * + * Note that invalid @len values can occur because the caller loads + * the value during an allowed data race. */ - if (WARN_ON_ONCE(data_size < (unsigned int)len)) { - pr_warn_once("wrong data size (%u, expecting >=%hu) for data: %.*s\n", - data_size, len, data_size, data); + if (data_size < (unsigned int)len) return false; - } /* Caller interested in the line count? */ if (line_count) @@ -1764,8 +1800,9 @@ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; + u64 s; - d_state = desc_read(desc_ring, id, desc_out); + d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record @@ -1775,7 +1812,7 @@ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || - desc_out->info.seq != seq) { + s != seq) { return -EINVAL; } @@ -1802,6 +1839,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) { struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info = to_info(desc_ring, seq); struct prb_desc *rdesc = to_desc(desc_ring, seq); atomic_long_t *state_var = &rdesc->state_var; struct prb_desc desc; @@ -1823,10 +1861,10 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, /* If requested, copy meta data. */ if (r->info) - memcpy(r->info, &desc.info, sizeof(*(r->info))); + memcpy(r->info, info, sizeof(*(r->info))); /* Copy text data. If it fails, this is a data-less record. */ - if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, desc.info.text_len, + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, r->text_buf, r->text_buf_size, line_count)) { return -ENOENT; } @@ -1836,7 +1874,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, * important. So if it fails, modify the copied meta data to report * that there is no dict data, thus silently dropping the dict data. */ - if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, desc.info.dict_len, + if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, info->dict_len, r->dict_buf, r->dict_buf_size, NULL)) { if (r->info) r->info->dict_len = 0; @@ -1853,11 +1891,12 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) enum desc_state d_state; struct prb_desc desc; unsigned long id; + u64 seq; for (;;) { id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ - d_state = desc_read(desc_ring, id, &desc); /* LMM(prb_first_seq:B) */ + d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ /* * This loop will not be infinite because the tail is @@ -1886,7 +1925,7 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) smp_rmb(); /* LMM(prb_first_seq:C) */ } - return desc.info.seq; + return seq; } /* @@ -2049,6 +2088,7 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) * @dictbits: The size of @dict_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. + * @infos: The printk_info buffer for ringbuffer records. * * This is the public function available to writers to setup a ringbuffer * during runtime using provided buffers. @@ -2060,12 +2100,15 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, char *dict_buf, unsigned int dictbits, - struct prb_desc *descs, unsigned int descbits) + struct prb_desc *descs, unsigned int descbits, + struct printk_info *infos) { memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); rb->desc_ring.count_bits = descbits; rb->desc_ring.descs = descs; + rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); @@ -2081,14 +2124,14 @@ void prb_init(struct printk_ringbuffer *rb, atomic_long_set(&rb->fail, 0); - descs[0].info.seq = -(u64)_DESCS_COUNT(descbits); - - descs[_DESCS_COUNT(descbits) - 1].info.seq = 0; atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = FAILED_LPOS; + + infos[0].seq = -(u64)_DESCS_COUNT(descbits); + infos[_DESCS_COUNT(descbits) - 1].seq = 0; } /** diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 853ea62dc5f2..97c8561e74e0 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -58,7 +58,6 @@ struct prb_data_blk_lpos { * @state_var: A bitwise combination of descriptor ID and descriptor state. */ struct prb_desc { - struct printk_info info; atomic_long_t state_var; struct prb_data_blk_lpos text_blk_lpos; struct prb_data_blk_lpos dict_blk_lpos; @@ -76,6 +75,7 @@ struct prb_data_ring { struct prb_desc_ring { unsigned int count_bits; struct prb_desc *descs; + struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; }; @@ -237,19 +237,8 @@ enum desc_state { static char _##name##_dict[1U << ((avgdictbits) + (descbits))] \ __aligned(__alignof__(unsigned long)); \ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ - /* this will be the first record reserved by a writer */ \ - [0] = { \ - .info = { \ - /* will be incremented to 0 on the first reservation */ \ - .seq = -(u64)_DESCS_COUNT(descbits), \ - }, \ - }, \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ - .info = { \ - /* reports the first seq value during the bootstrap phase */ \ - .seq = 0, \ - }, \ /* reusable */ \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ @@ -257,10 +246,23 @@ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ .dict_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ +static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ +}; \ static struct printk_ringbuffer name = { \ .desc_ring = { \ .count_bits = descbits, \ .descs = &_##name##_descs[0], \ + .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ @@ -336,7 +338,8 @@ void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, char *dict_buf, unsigned int dict_buf_size, - struct prb_desc *descs, unsigned int descs_count_bits); + struct prb_desc *descs, unsigned int descs_count_bits, + struct printk_info *infos); unsigned int prb_record_text_space(struct prb_reserved_entry *e); /* Reader Interface */ -- cgit v1.2.3 From 74caba7f2a0685575b3ee5330a118f5922485e02 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 21 Sep 2020 13:24:45 +0206 Subject: printk: move dictionary keys to dev_printk_info Dictionaries are only used for SUBSYSTEM and DEVICE properties. The current implementation stores the property names each time they are used. This requires more space than otherwise necessary. Also, because the dictionary entries are currently considered optional, it cannot be relied upon that they are always available, even if the writer wanted to store them. These issues will increase should new dictionary properties be introduced. Rather than storing the subsystem and device properties in the dict ring, introduce a struct dev_printk_info with separate fields to store only the property values. Embed this struct within the struct printk_info to provide guaranteed availability. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/87mu1jl6ne.fsf@jogness.linutronix.de --- Documentation/admin-guide/kdump/gdbmacros.txt | 73 +++++------ drivers/base/core.c | 46 +++---- include/linux/dev_printk.h | 8 ++ include/linux/printk.h | 6 +- kernel/printk/internal.h | 4 +- kernel/printk/printk.c | 166 +++++++++++++------------- kernel/printk/printk_ringbuffer.h | 3 + kernel/printk/printk_safe.c | 2 +- scripts/gdb/linux/dmesg.py | 16 ++- 9 files changed, 164 insertions(+), 160 deletions(-) (limited to 'kernel/printk') diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt index 94fabb165abf..82aecdcae8a6 100644 --- a/Documentation/admin-guide/kdump/gdbmacros.txt +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -172,13 +172,13 @@ end define dump_record set var $desc = $arg0 - if ($argc > 1) - set var $prev_flags = $arg1 + set var $info = $arg1 + if ($argc > 2) + set var $prev_flags = $arg2 else set var $prev_flags = 0 end - set var $info = &$desc->info set var $prefix = 1 set var $newline = 1 @@ -237,44 +237,36 @@ define dump_record # handle dictionary data - set var $begin = $desc->dict_blk_lpos.begin % (1U << prb->dict_data_ring.size_bits) - set var $next = $desc->dict_blk_lpos.next % (1U << prb->dict_data_ring.size_bits) - - # handle data-less record - if ($begin & 1) - set var $dict_len = 0 - set var $dict = "" - else - # handle wrapping data block - if ($begin > $next) - set var $begin = 0 - end - - # skip over descriptor id - set var $begin = $begin + sizeof(long) - - # handle truncated message - if ($next - $begin < $info->dict_len) - set var $dict_len = $next - $begin - else - set var $dict_len = $info->dict_len + set var $dict = &$info->dev_info.subsystem[0] + set var $dict_len = sizeof($info->dev_info.subsystem) + if ($dict[0] != '\0') + printf " SUBSYSTEM=" + set var $idx = 0 + while ($idx < $dict_len) + set var $c = $dict[$idx] + if ($c == '\0') + loop_break + else + if ($c < ' ' || $c >= 127 || $c == '\\') + printf "\\x%02x", $c + else + printf "%c", $c + end + end + set var $idx = $idx + 1 end - - set var $dict = &prb->dict_data_ring.data[$begin] + printf "\n" end - if ($dict_len > 0) + set var $dict = &$info->dev_info.device[0] + set var $dict_len = sizeof($info->dev_info.device) + if ($dict[0] != '\0') + printf " DEVICE=" set var $idx = 0 - set var $line = 1 while ($idx < $dict_len) - if ($line) - printf " " - set var $line = 0 - end set var $c = $dict[$idx] if ($c == '\0') - printf "\n" - set var $line = 1 + loop_break else if ($c < ' ' || $c >= 127 || $c == '\\') printf "\\x%02x", $c @@ -288,10 +280,10 @@ define dump_record end end document dump_record - Dump a single record. The first parameter is the descriptor - sequence number, the second is optional and specifies the - previous record's flags, used for properly formatting - continued lines. + Dump a single record. The first parameter is the descriptor, + the second parameter is the info, the third parameter is + optional and specifies the previous record's flags, used for + properly formatting continued lines. end define dmesg @@ -311,12 +303,13 @@ define dmesg while (1) set var $desc = &prb->desc_ring.descs[$id % $desc_count] + set var $info = &prb->desc_ring.infos[$id % $desc_count] # skip non-committed record set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift) if ($state == $desc_committed || $state == $desc_finalized) - dump_record $desc $prev_flags - set var $prev_flags = $desc->info.flags + dump_record $desc $info $prev_flags + set var $prev_flags = $info->flags end set var $id = ($id + 1) & $id_mask diff --git a/drivers/base/core.c b/drivers/base/core.c index 67d39a90b45c..22d83aedb64e 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3815,22 +3815,21 @@ void device_shutdown(void) */ #ifdef CONFIG_PRINTK -static int -create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) +static void +set_dev_info(const struct device *dev, struct dev_printk_info *dev_info) { const char *subsys; - size_t pos = 0; + + memset(dev_info, 0, sizeof(*dev_info)); if (dev->class) subsys = dev->class->name; else if (dev->bus) subsys = dev->bus->name; else - return 0; + return; - pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys); - if (pos >= hdrlen) - goto overflow; + strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem)); /* * Add device identifier DEVICE=: @@ -3846,41 +3845,28 @@ create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen) c = 'b'; else c = 'c'; - pos++; - pos += snprintf(hdr + pos, hdrlen - pos, - "DEVICE=%c%u:%u", - c, MAJOR(dev->devt), MINOR(dev->devt)); + + snprintf(dev_info->device, sizeof(dev_info->device), + "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt)); } else if (strcmp(subsys, "net") == 0) { struct net_device *net = to_net_dev(dev); - pos++; - pos += snprintf(hdr + pos, hdrlen - pos, - "DEVICE=n%u", net->ifindex); + snprintf(dev_info->device, sizeof(dev_info->device), + "n%u", net->ifindex); } else { - pos++; - pos += snprintf(hdr + pos, hdrlen - pos, - "DEVICE=+%s:%s", subsys, dev_name(dev)); + snprintf(dev_info->device, sizeof(dev_info->device), + "+%s:%s", subsys, dev_name(dev)); } - - if (pos >= hdrlen) - goto overflow; - - return pos; - -overflow: - dev_WARN(dev, "device/subsystem name too long"); - return 0; } int dev_vprintk_emit(int level, const struct device *dev, const char *fmt, va_list args) { - char hdr[128]; - size_t hdrlen; + struct dev_printk_info dev_info; - hdrlen = create_syslog_header(dev, hdr, sizeof(hdr)); + set_dev_info(dev, &dev_info); - return vprintk_emit(0, level, hdrlen ? hdr : NULL, hdrlen, fmt, args); + return vprintk_emit(0, level, &dev_info, fmt, args); } EXPORT_SYMBOL(dev_vprintk_emit); diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 3028b644b4fb..6f009559ee54 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -21,6 +21,14 @@ struct device; +#define PRINTK_INFO_SUBSYSTEM_LEN 16 +#define PRINTK_INFO_DEVICE_LEN 48 + +struct dev_printk_info { + char subsystem[PRINTK_INFO_SUBSYSTEM_LEN]; + char device[PRINTK_INFO_DEVICE_LEN]; +}; + #ifdef CONFIG_PRINTK __printf(3, 0) __cold diff --git a/include/linux/printk.h b/include/linux/printk.h index fc8f03c54543..071500ee7281 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -158,10 +158,12 @@ static inline void printk_nmi_direct_enter(void) { } static inline void printk_nmi_direct_exit(void) { } #endif /* PRINTK_NMI */ +struct dev_printk_info; + #ifdef CONFIG_PRINTK -asmlinkage __printf(5, 0) +asmlinkage __printf(4, 0) int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); asmlinkage __printf(1, 0) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 660f9a6bf73a..3a8fd491758c 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,9 +14,9 @@ extern raw_spinlock_t logbuf_lock; -__printf(5, 0) +__printf(4, 0) int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 25cfe4fe48af..f1e243cc284a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -296,8 +296,8 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each - * containing variable length message and dictionary text. Every record - * also contains its own meta-data (@info). + * containing variable length message text. Every record also contains its + * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel @@ -310,9 +310,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * terminated. * * Optionally, a record can carry a dictionary of properties (key/value - * pairs), to provide userspace with a machine-readable message context. The - * length of the dictionary is available in @dict_len. The dictionary is not - * terminated. + * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -322,21 +320,20 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * - * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value - * follows directly after a '=' character. Every property is terminated by - * a '\0' character. The last property is not terminated. + * Valid characters in property names are [a-zA-Z0-9.-_]. Property names + * and values are terminated by a '\0' character. * * Example of record values: - * record.text_buf = "it's a line" (unterminated) - * record.dict_buf = "DEVICE=b8:2\0DRIVER=bug" (unterminated) - * record.info.seq = 56 - * record.info.ts_nsec = 36863 - * record.info.text_len = 11 - * record.info.dict_len = 22 - * record.info.facility = 0 (LOG_KERN) - * record.info.flags = 0 - * record.info.level = 3 (LOG_ERR) - * record.info.caller_id = 299 (task 299) + * record.text_buf = "it's a line" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * record.info.dev_info.subsystem = "pci" (terminated) + * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might @@ -498,19 +495,19 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) /* insert record into the buffer, discard old ones, update heads */ static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, + const struct dev_printk_info *dev_info, const char *text, u16 text_len) { struct prb_reserved_entry e; struct printk_record r; u16 trunc_msg_len = 0; - prb_rec_init_wr(&r, text_len, dict_len); + prb_rec_init_wr(&r, text_len, 0); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&text_len, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len, dict_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len, 0); /* survive when the log buffer is too small for trunc_msg */ if (!prb_reserve(&e, prb, &r)) return 0; @@ -521,10 +518,6 @@ static int log_store(u32 caller_id, int facility, int level, if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; - if (r.dict_buf) { - memcpy(&r.dict_buf[0], dict, dict_len); - r.info->dict_len = dict_len; - } r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; @@ -533,6 +526,8 @@ static int log_store(u32 caller_id, int facility, int level, else r.info->ts_nsec = local_clock(); r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* insert message */ if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) @@ -613,9 +608,9 @@ static ssize_t info_print_ext_header(char *buf, size_t size, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) +static ssize_t msg_add_ext_text(char *buf, size_t size, + const char *text, size_t text_len, + unsigned char endc) { char *p = buf, *e = buf + size; size_t i; @@ -629,36 +624,44 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, else append_char(&p, e, c); } - append_char(&p, e, '\n'); + append_char(&p, e, endc); - if (dict_len) { - bool line = true; + return p - buf; +} - for (i = 0; i < dict_len; i++) { - unsigned char c = dict[i]; +static ssize_t msg_add_dict_text(char *buf, size_t size, + const char *key, const char *val) +{ + size_t val_len = strlen(val); + ssize_t len; - if (line) { - append_char(&p, e, ' '); - line = false; - } + if (!val_len) + return 0; - if (c == '\0') { - append_char(&p, e, '\n'); - line = true; - continue; - } + len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ + len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); + len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); - if (c < ' ' || c >= 127 || c == '\\') { - p += scnprintf(p, e - p, "\\x%02x", c); - continue; - } + return len; +} - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *text, size_t text_len, + struct dev_printk_info *dev_info) +{ + ssize_t len; - return p - buf; + len = msg_add_ext_text(buf, size, text, text_len, '\n'); + + if (!dev_info) + goto out; + + len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", + dev_info->subsystem); + len += msg_add_dict_text(buf + len, size - len, "DEVICE", + dev_info->device); +out: + return len; } /* /dev/kmsg - userspace message inject/listen interface */ @@ -670,7 +673,6 @@ struct devkmsg_user { struct printk_info info; char text_buf[CONSOLE_EXT_LOG_MAX]; - char dict_buf[CONSOLE_EXT_LOG_MAX]; struct printk_record record; }; @@ -681,7 +683,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_emit(facility, level, NULL, 0, fmt, args); + r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; @@ -791,8 +793,8 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - &r->dict_buf[0], r->info->dict_len, - &r->text_buf[0], r->info->text_len); + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); user->seq = r->info->seq + 1; logbuf_unlock_irq(); @@ -897,7 +899,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf), - &user->dict_buf[0], sizeof(user->dict_buf)); + NULL, 0); logbuf_lock_irq(); user->seq = prb_first_valid_seq(prb); @@ -941,6 +943,8 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { + struct dev_printk_info *dev_info = NULL; + VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); @@ -978,6 +982,13 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, dict_len); VMCOREINFO_OFFSET(printk_info, caller_id); + VMCOREINFO_OFFSET(printk_info, dev_info); + + VMCOREINFO_STRUCT_SIZE(dev_printk_info); + VMCOREINFO_OFFSET(dev_printk_info, subsystem); + VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); + VMCOREINFO_OFFSET(dev_printk_info, device); + VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); @@ -1070,22 +1081,19 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct prb_reserved_entry e; struct printk_record dest_r; - prb_rec_init_wr(&dest_r, r->info->text_len, r->info->dict_len); + prb_rec_init_wr(&dest_r, r->info->text_len, 0); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; - if (dest_r.dict_buf) { - memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], r->info->dict_len); - dest_r.info->dict_len = r->info->dict_len; - } dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; + memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); @@ -1093,7 +1101,6 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, } static char setup_text_buf[CONSOLE_EXT_LOG_MAX] __initdata; -static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] __initdata; void __init setup_log_buf(int early) { @@ -1165,7 +1172,7 @@ void __init setup_log_buf(int early) prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf), - &setup_dict_buf[0], sizeof(setup_dict_buf)); + NULL, 0); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), @@ -1903,7 +1910,9 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +static size_t log_output(int facility, int level, enum log_flags lflags, + const struct dev_printk_info *dev_info, + char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); @@ -1927,12 +1936,12 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c /* Store it in the record log */ return log_store(caller_id, facility, level, lflags, 0, - dict, dictlen, text, text_len); + dev_info, text, text_len); } /* Must be called under logbuf_lock. */ int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; @@ -1974,15 +1983,14 @@ int vprintk_store(int facility, int level, if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; - if (dict) + if (dev_info) lflags |= LOG_NEWLINE; - return log_output(facility, level, lflags, - dict, dictlen, text, text_len); + return log_output(facility, level, lflags, dev_info, text, text_len); } asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { int printed_len; @@ -2003,7 +2011,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ @@ -2037,7 +2045,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); @@ -2100,8 +2108,8 @@ static ssize_t info_print_ext_header(char *buf, size_t size, return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) { return 0; } + char *text, size_t text_len, + struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, @@ -2390,7 +2398,6 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; - static char dict[LOG_LINE_MAX]; unsigned long flags; bool do_cond_resched, retry; struct printk_info info; @@ -2401,7 +2408,7 @@ void console_unlock(void) return; } - prb_rec_init_rd(&r, &info, text, sizeof(text), dict, sizeof(dict)); + prb_rec_init_rd(&r, &info, text, sizeof(text), NULL, 0); /* * Console drivers are called with interrupts disabled, so @@ -2473,10 +2480,9 @@ skip: r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - &r.dict_buf[0], - r.info->dict_len, &r.text_buf[0], - r.info->text_len); + r.info->text_len, + &r.info->dev_info); } len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, @@ -3055,7 +3061,7 @@ int vprintk_deferred(const char *fmt, va_list args) { int r; - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); defer_console_output(); return r; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 97c8561e74e0..480499ce3c6b 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -4,6 +4,7 @@ #define _KERNEL_PRINTK_RINGBUFFER_H #include +#include /* * Meta information about each stored message. @@ -21,6 +22,8 @@ struct printk_info { u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ + + struct dev_printk_info dev_info; }; /* diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 50aeae770434..5dbc40160990 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -375,7 +375,7 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) raw_spin_trylock(&logbuf_lock)) { int len; - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); raw_spin_unlock(&logbuf_lock); defer_console_output(); return len; diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py index bce14de5f610..a92c55bd8de5 100644 --- a/scripts/gdb/linux/dmesg.py +++ b/scripts/gdb/linux/dmesg.py @@ -52,6 +52,12 @@ class LxDmesg(gdb.Command): addr = utils.read_ulong(desc_ring, off) descs = utils.read_memoryview(inf, addr, desc_sz * desc_ring_count).tobytes() + # read in info array + info_sz = printk_info_type.get_type().sizeof + off = prb_desc_ring_type.get_type()['infos'].bitpos // 8 + addr = utils.read_ulong(desc_ring, off) + infos = utils.read_memoryview(inf, addr, info_sz * desc_ring_count).tobytes() + # read in text data ring structure off = printk_ringbuffer_type.get_type()['text_data_ring'].bitpos // 8 addr = prb_addr + off @@ -73,9 +79,8 @@ class LxDmesg(gdb.Command): begin_off = off + (prb_data_blk_lpos_type.get_type()['begin'].bitpos // 8) next_off = off + (prb_data_blk_lpos_type.get_type()['next'].bitpos // 8) - off = prb_desc_type.get_type()['info'].bitpos // 8 - ts_off = off + printk_info_type.get_type()['ts_nsec'].bitpos // 8 - len_off = off + printk_info_type.get_type()['text_len'].bitpos // 8 + ts_off = printk_info_type.get_type()['ts_nsec'].bitpos // 8 + len_off = printk_info_type.get_type()['text_len'].bitpos // 8 # definitions from kernel/printk/printk_ringbuffer.h desc_committed = 1 @@ -95,6 +100,7 @@ class LxDmesg(gdb.Command): while True: ind = did % desc_ring_count desc_off = desc_sz * ind + info_off = info_sz * ind # skip non-committed record state = 3 & (utils.read_u64(descs, desc_off + sv_off + @@ -119,7 +125,7 @@ class LxDmesg(gdb.Command): # skip over descriptor id text_start = begin + utils.get_long_type().sizeof - text_len = utils.read_u16(descs, desc_off + len_off) + text_len = utils.read_u16(infos, info_off + len_off) # handle truncated message if end - text_start < text_len: @@ -128,7 +134,7 @@ class LxDmesg(gdb.Command): text = text_data[text_start:text_start + text_len].decode( encoding='utf8', errors='replace') - time_stamp = utils.read_u64(descs, desc_off + ts_off) + time_stamp = utils.read_u64(infos, info_off + ts_off) for line in text.splitlines(): msg = u"[{time:12.6f}] {line}\n".format( -- cgit v1.2.3 From f35efc78add6439a9fbe611f2dd517641e82d067 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sat, 19 Sep 2020 00:40:21 +0206 Subject: printk: remove dict ring Since there is no code that will ever store anything into the dict ring, remove it. If any future dictionary properties are to be added, these should be added to the struct printk_info. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200918223421.21621-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 46 ++++------- kernel/printk/printk_ringbuffer.c | 155 +++++++++----------------------------- kernel/printk/printk_ringbuffer.h | 63 ++++------------ 3 files changed, 64 insertions(+), 200 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f1e243cc284a..1fe3d0cb2fe0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -427,7 +427,6 @@ static u32 log_buf_len = __LOG_BUF_LEN; * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). - * The dictionary buffer will be the same size as the text buffer. */ #define PRB_AVGBITS 5 /* 32 character average length */ @@ -435,7 +434,7 @@ static u32 log_buf_len = __LOG_BUF_LEN; #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); + PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; @@ -502,12 +501,12 @@ static int log_store(u32 caller_id, int facility, int level, struct printk_record r; u16 trunc_msg_len = 0; - prb_rec_init_wr(&r, text_len, 0); + prb_rec_init_wr(&r, text_len); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&text_len, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len, 0); + prb_rec_init_wr(&r, text_len + trunc_msg_len); /* survive when the log buffer is too small for trunc_msg */ if (!prb_reserve(&e, prb, &r)) return 0; @@ -898,8 +897,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); prb_rec_init_rd(&user->record, &user->info, - &user->text_buf[0], sizeof(user->text_buf), - NULL, 0); + &user->text_buf[0], sizeof(user->text_buf)); logbuf_lock_irq(); user->seq = prb_first_valid_seq(prb); @@ -957,7 +955,6 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); - VMCOREINFO_OFFSET(printk_ringbuffer, dict_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); @@ -970,7 +967,6 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); - VMCOREINFO_OFFSET(prb_desc, dict_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); @@ -980,7 +976,6 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); - VMCOREINFO_OFFSET(printk_info, dict_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); @@ -1081,7 +1076,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct prb_reserved_entry e; struct printk_record dest_r; - prb_rec_init_wr(&dest_r, r->info->text_len, 0); + prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; @@ -1112,7 +1107,6 @@ void __init setup_log_buf(int early) size_t new_descs_size; size_t new_infos_size; unsigned long flags; - char *new_dict_buf; char *new_log_buf; unsigned int free; u64 seq; @@ -1147,19 +1141,12 @@ void __init setup_log_buf(int early) return; } - new_dict_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); - if (unlikely(!new_dict_buf)) { - pr_err("log_buf_len: %lu dict bytes not available\n", - new_log_buf_len); - goto err_free_log_buf; - } - new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); - goto err_free_dict_buf; + goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); @@ -1170,13 +1157,10 @@ void __init setup_log_buf(int early) goto err_free_descs; } - prb_rec_init_rd(&r, &info, - &setup_text_buf[0], sizeof(setup_text_buf), - NULL, 0); + prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), - new_dict_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); @@ -1211,8 +1195,6 @@ void __init setup_log_buf(int early) err_free_descs: memblock_free(__pa(new_descs), new_descs_size); -err_free_dict_buf: - memblock_free(__pa(new_dict_buf), new_log_buf_len); err_free_log_buf: memblock_free(__pa(new_log_buf), new_log_buf_len); } @@ -1463,7 +1445,7 @@ static int syslog_print(char __user *buf, int size) if (!text) return -ENOMEM; - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); while (size > 0) { size_t n; @@ -1550,7 +1532,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len -= get_record_print_text_size(&info, line_count, true, time); } - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); len = 0; prb_for_each_record(seq, prb, seq, &r) { @@ -1920,7 +1902,7 @@ static size_t log_output(int facility, int level, enum log_flags lflags, struct prb_reserved_entry e; struct printk_record r; - prb_rec_init_wr(&r, text_len, 0); + prb_rec_init_wr(&r, text_len); if (prb_reserve_in_last(&e, prb, &r, caller_id)) { memcpy(&r.text_buf[r.info->text_len], text, text_len); r.info->text_len += text_len; @@ -2408,7 +2390,7 @@ void console_unlock(void) return; } - prb_rec_init_rd(&r, &info, text, sizeof(text), NULL, 0); + prb_rec_init_rd(&r, &info, text, sizeof(text)); /* * Console drivers are called with interrupts disabled, so @@ -3266,7 +3248,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, size_t l = 0; bool ret = false; - prb_rec_init_rd(&r, &info, line, size, NULL, 0); + prb_rec_init_rd(&r, &info, line, size); if (!dumper->active) goto out; @@ -3357,7 +3339,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, bool ret = false; bool time = printk_time; - prb_rec_init_rd(&r, &info, buf, size, NULL, 0); + prb_rec_init_rd(&r, &info, buf, size); if (!dumper->active || !buf || !size) goto out; @@ -3405,7 +3387,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, l += record_print_text(&r, syslog, time); /* adjust record to store to remaining buffer space */ - prb_rec_init_rd(&r, &info, buf + l, size - l, NULL, 0); + prb_rec_init_rd(&r, &info, buf + l, size - l); seq = r.info->seq + 1; } diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index de4b10a98623..13b94b92342e 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -18,18 +18,13 @@ * A ring of descriptors and their meta data (such as sequence number, * timestamp, loglevel, etc.) as well as internal state information about * the record and logical positions specifying where in the other - * ringbuffers the text and dictionary strings are located. + * ringbuffer the text strings are located. * * text_data_ring * A ring of data blocks. A data block consists of an unsigned long * integer (ID) that maps to a desc_ring index followed by the text * string of the record. * - * dict_data_ring - * A ring of data blocks. A data block consists of an unsigned long - * integer (ID) that maps to a desc_ring index followed by the dictionary - * string of the record. - * * The internal state information of a descriptor is the key element to allow * readers and writers to locklessly synchronize access to the data. * @@ -40,8 +35,8 @@ * ~~~~~~~~~~~~~~~ * The descriptor ring is an array of descriptors. A descriptor contains * essential meta data to track the data of a printk record using - * blk_lpos structs pointing to associated text and dictionary data blocks - * (see "Data Rings" below). Each descriptor is assigned an ID that maps + * blk_lpos structs pointing to associated text data blocks (see + * "Data Rings" below). Each descriptor is assigned an ID that maps * directly to index values of the descriptor array and has a state. The ID * and the state are bitwise combined into a single descriptor field named * @state_var, allowing ID and state to be synchronously and atomically @@ -62,8 +57,8 @@ * writer cannot reopen the descriptor. * * reusable - * The record exists, but its text and/or dictionary data may no longer - * be available. + * The record exists, but its text and/or meta data may no longer be + * available. * * Querying the @state_var of a record requires providing the ID of the * descriptor to query. This can yield a possible fifth (pseudo) state: @@ -77,7 +72,7 @@ * When a new descriptor should be created (and the ring is full), the tail * descriptor is invalidated by first transitioning to the reusable state and * then invalidating all tail data blocks up to and including the data blocks - * associated with the tail descriptor (for text and dictionary rings). Then + * associated with the tail descriptor (for the text ring). Then * @tail_id is advanced, followed by advancing @head_id. And finally the * @state_var of the new descriptor is initialized to the new ID and reserved * state. @@ -108,13 +103,9 @@ * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * - * Data Rings - * ~~~~~~~~~~ - * The two data rings (text and dictionary) function identically. They exist - * separately so that their buffer sizes can be individually set and they do - * not affect one another. - * - * Data rings are byte arrays composed of data blocks. Data blocks are + * Data Ring + * ~~~~~~~~~ + * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte @@ -165,34 +156,28 @@ * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * - * DEFINE_PRINTKRB(test_rb, 15, 5, 3); + * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of - * 1 MiB (2 ^ (15 + 5)) for text data and 256 KiB (2 ^ (15 + 3)) for - * dictionary data. + * 1 MiB (2 ^ (15 + 5)) for text data. * * Sample writer code:: * - * const char *dictstr = "dictionary text"; * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate - * prb_rec_init_wr(&r, strlen(textstr) + 1, strlen(dictstr) + 1); + * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); - * r.info->text_len = strlen(textstr); - * - * // dictionary allocation may have failed - * if (r.dict_buf) { - * snprintf(r.dict_buf, r.dict_buf_size, "%s", dictstr); - * r.info->dict_len = strlen(dictstr); - * } * + * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); + * r.info->caller_id = printk_caller_id(); * + * // commit and finalize the record * prb_final_commit(&e); * } * @@ -203,8 +188,9 @@ * Sample writer code (record extending):: * * // alternate rest of previous example - * r.info->ts_nsec = local_clock(); + * * r.info->text_len = strlen(textstr); + * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) @@ -214,7 +200,7 @@ * ... * * // specify additional 5 bytes text space to extend - * prb_rec_init_wr(&r, 5, 0); + * prb_rec_init_wr(&r, 5); * * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id())) { * snprintf(&r.text_buf[r.info->text_len], @@ -222,6 +208,7 @@ * * r.info->text_len += 5; * + * // commit and finalize the record * prb_final_commit(&e); * } * @@ -230,11 +217,9 @@ * struct printk_info info; * struct printk_record r; * char text_buf[32]; - * char dict_buf[32]; * u64 seq; * - * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf), - * &dict_buf[0], sizeof(dict_buf)); + * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) @@ -245,13 +230,8 @@ * text_buf[r.text_buf_size - 1] = 0; * } * - * if (info.dict_len > r.dict_buf_size) { - * pr_warn("record %llu dict truncated\n", info.seq); - * dict_buf[r.dict_buf_size - 1] = 0; - * } - * - * pr_info("%llu: %llu: %s;%s\n", info.seq, info.ts_nsec, - * &text_buf[0], info.dict_len ? &dict_buf[0] : ""); + * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, + * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to @@ -495,8 +475,6 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, */ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ - memcpy(&desc_out->dict_blk_lpos, &desc->dict_blk_lpos, - sizeof(desc_out->dict_blk_lpos)); /* also part of desc_read:C */ if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) @@ -571,7 +549,7 @@ static void desc_make_reusable(struct prb_desc_ring *desc_ring, } /* - * Given a data ring (text or dict), put the associated descriptor of each + * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either @@ -586,21 +564,12 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long *lpos_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; - struct prb_data_blk_lpos *blk_lpos; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; + struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; - /* - * Using the provided @data_ring, point @blk_lpos to the correct - * blk_lpos within the local copy of the descriptor. - */ - if (data_ring == &rb->text_data_ring) - blk_lpos = &desc.text_blk_lpos; - else - blk_lpos = &desc.dict_blk_lpos; - /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin); @@ -839,8 +808,6 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) return false; - if (!data_push_tail(rb, &rb->dict_data_ring, desc.dict_blk_lpos.next)) - return false; /* * Check the next descriptor after @tail_id before pushing the tail @@ -1347,9 +1314,8 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * data. * * The writer specifies the text size to extend (not the new total size) by - * setting the @text_buf_size field of @r. Extending dictionaries is not - * supported, so @dict_buf_size of @r should be set to 0. To ensure proper - * initialization of @r, prb_rec_init_wr() should be used. + * setting the @text_buf_size field of @r. To ensure proper initialization + * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using @@ -1364,9 +1330,6 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * * - @r->text_buf_size is set to the new total size of the buffer. * - * - @r->dict_buf and @r->dict_buf_size are cleared because extending - * the dict buffer is not supported. - * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * @@ -1375,8 +1338,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than - * @text_buf_size and @r->info->dict_len may be set, even though - * @dict_buf_size is 0. Writers can use @r->info->text_len to know + * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ @@ -1454,10 +1416,6 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (r->text_buf_size && !r->text_buf) goto fail; - /* Although dictionary data may be in use, it cannot be extended. */ - r->dict_buf = NULL; - r->dict_buf_size = 0; - r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); @@ -1494,27 +1452,21 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) * * This is the public function available to writers to reserve data. * - * The writer specifies the text and dict sizes to reserve by setting the - * @text_buf_size and @dict_buf_size fields of @r, respectively. Dictionaries - * are optional, so @dict_buf_size is allowed to be 0. To ensure proper - * initialization of @r, prb_rec_init_wr() should be used. + * The writer specifies the text size to reserve by setting the + * @text_buf_size field of @r. To ensure proper initialization of @r, + * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * - * On success, the fields @info, @text_buf, @dict_buf of @r will be set by - * this function and should be filled in by the writer before committing. Also + * On success, the fields @info and @text_buf of @r will be set by this + * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * - * If the function fails to reserve dictionary space (but all else succeeded), - * it will still report success. In that case @dict_buf is set to NULL and - * @dict_buf_size is set to 0. Writers must check this before writing to - * dictionary space. - * - * Important: @info->text_len and @info->dict_len need to be set correctly by - * the writer in order for data to be readable and/or extended. - * Their values are initialized to 0. + * Important: @info->text_len needs to be set correctly by the writer in + * order for data to be readable and/or extended. Its value + * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) @@ -1528,9 +1480,6 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; - if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) - goto fail; - /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. Disable @@ -1598,15 +1547,6 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, goto fail; } - r->dict_buf = data_alloc(rb, &rb->dict_data_ring, r->dict_buf_size, - &d->dict_blk_lpos, id); - /* - * If dict data allocation fails, the caller can still commit - * text. But dictionary information will not be available. - */ - if (r->dict_buf_size && !r->dict_buf) - r->dict_buf_size = 0; - r->info = info; /* Record full text space used by record. */ @@ -1869,17 +1809,6 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, return -ENOENT; } - /* - * Copy dict data. Although this should not fail, dict data is not - * important. So if it fails, modify the copied meta data to report - * that there is no dict data, thus silently dropping the dict data. - */ - if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, info->dict_len, - r->dict_buf, r->dict_buf_size, NULL)) { - if (r->info) - r->info->dict_len = 0; - } - /* Ensure the record is still finalized and has the same @seq. */ return desc_read_finalized_seq(desc_ring, id, seq, &desc); } @@ -1974,7 +1903,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * * This is the public function available to readers to read a record. * - * The reader provides the @info, @text_buf, @dict_buf buffers of @r to be + * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. @@ -2022,7 +1951,7 @@ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, { struct printk_record r; - prb_rec_init_rd(&r, info, NULL, 0, NULL, 0); + prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } @@ -2084,8 +2013,6 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) * @rb: The ringbuffer to initialize. * @text_buf: The data buffer for text data. * @textbits: The size of @text_buf as a power-of-2 value. - * @dict_buf: The data buffer for dictionary data. - * @dictbits: The size of @dict_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. * @infos: The printk_info buffer for ringbuffer records. @@ -2099,7 +2026,6 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) */ void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, - char *dict_buf, unsigned int dictbits, struct prb_desc *descs, unsigned int descbits, struct printk_info *infos) { @@ -2117,18 +2043,11 @@ void prb_init(struct printk_ringbuffer *rb, atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); - rb->dict_data_ring.size_bits = dictbits; - rb->dict_data_ring.data = dict_buf; - atomic_long_set(&rb->dict_data_ring.head_lpos, BLK0_LPOS(dictbits)); - atomic_long_set(&rb->dict_data_ring.tail_lpos, BLK0_LPOS(dictbits)); - atomic_long_set(&rb->fail, 0); atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = FAILED_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = FAILED_LPOS; infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 480499ce3c6b..0adaa685d1ca 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -9,15 +9,13 @@ /* * Meta information about each stored message. * - * All fields are set and used by the printk code except for - * @seq, @text_len, @dict_len, which are set and/or modified - * by the ringbuffer code. + * All fields are set by the printk code except for @seq, which is + * set by the ringbuffer code. */ struct printk_info { u64 seq; /* sequence number */ u64 ts_nsec; /* timestamp in nanoseconds */ u16 text_len; /* length of text message */ - u16 dict_len; /* length of dictionary message */ u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ @@ -30,23 +28,20 @@ struct printk_info { * A structure providing the buffers, used by writers and readers. * * Writers: - * Using prb_rec_init_wr(), a writer sets @text_buf_size and @dict_buf_size - * before calling prb_reserve(). On success, prb_reserve() sets @info, - * @text_buf, @dict_buf to buffers reserved for that writer. + * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling + * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to + * buffers reserved for that writer. * * Readers: * Using prb_rec_init_rd(), a reader sets all fields before calling - * prb_read_valid(). Note that the reader provides the @info, @text_buf, - * @dict_buf buffers. On success, the struct pointed to by @info will be - * filled and the char arrays pointed to by @text_buf and @dict_buf will - * be filled with text and dict data. + * prb_read_valid(). Note that the reader provides the @info and @text_buf, + * buffers. On success, the struct pointed to by @info will be filled and + * the char array pointed to by @text_buf will be filled with text data. */ struct printk_record { struct printk_info *info; char *text_buf; - char *dict_buf; unsigned int text_buf_size; - unsigned int dict_buf_size; }; /* Specifies the logical position and span of a data block. */ @@ -63,7 +58,6 @@ struct prb_data_blk_lpos { struct prb_desc { atomic_long_t state_var; struct prb_data_blk_lpos text_blk_lpos; - struct prb_data_blk_lpos dict_blk_lpos; }; /* A ringbuffer of "ID + data" elements. */ @@ -92,7 +86,6 @@ struct prb_desc_ring { struct printk_ringbuffer { struct prb_desc_ring desc_ring; struct prb_data_ring text_data_ring; - struct prb_data_ring dict_data_ring; atomic_long_t fail; }; @@ -236,9 +229,7 @@ enum desc_state { * Note: The specified external buffer must be of the size: * 2 ^ (descbits + avgtextbits) */ -#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, text_buf) \ -static char _##name##_dict[1U << ((avgdictbits) + (descbits))] \ - __aligned(__alignof__(unsigned long)); \ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ @@ -246,7 +237,6 @@ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ .text_blk_lpos = FAILED_BLK_LPOS, \ - .dict_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ @@ -275,12 +265,6 @@ static struct printk_ringbuffer name = { \ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ }, \ - .dict_data_ring = { \ - .size_bits = (avgtextbits) + (descbits), \ - .data = &_##name##_dict[0], \ - .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ - .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ - }, \ .fail = ATOMIC_LONG_INIT(0), \ } @@ -290,17 +274,15 @@ static struct printk_ringbuffer name = { \ * @name: The name of the ringbuffer variable. * @descbits: The number of descriptors as a power-of-2 value. * @avgtextbits: The average text data size per record as a power-of-2 value. - * @avgdictbits: The average dictionary data size per record as a - * power-of-2 value. * * This is a macro for defining a ringbuffer and all internal structures * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a * variant where the text data buffer can be specified externally. */ -#define DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits) \ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ __aligned(__alignof__(unsigned long)); \ -_DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, &_##name##_text[0]) +_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) /* Writer Interface */ @@ -309,26 +291,13 @@ _DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, &_##name##_text[0]) * * @r: The record to initialize. * @text_buf_size: The needed text buffer size. - * @dict_buf_size: The needed dictionary buffer size. - * - * Initialize all the fields that a writer is interested in. If - * @dict_buf_size is 0, a dictionary buffer will not be reserved. - * @text_buf_size must be greater than 0. - * - * Note that although @dict_buf_size may be initialized to non-zero, - * its value must be rechecked after a successful call to prb_reserve() - * to verify a dictionary buffer was actually reserved. Dictionary buffer - * reservation is allowed to fail. */ static inline void prb_rec_init_wr(struct printk_record *r, - unsigned int text_buf_size, - unsigned int dict_buf_size) + unsigned int text_buf_size) { r->info = NULL; r->text_buf = NULL; - r->dict_buf = NULL; r->text_buf_size = text_buf_size; - r->dict_buf_size = dict_buf_size; } bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, @@ -340,7 +309,6 @@ void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, - char *dict_buf, unsigned int dict_buf_size, struct prb_desc *descs, unsigned int descs_count_bits, struct printk_info *infos); unsigned int prb_record_text_space(struct prb_reserved_entry *e); @@ -354,8 +322,6 @@ unsigned int prb_record_text_space(struct prb_reserved_entry *e); * @info: A buffer to store record meta-data. * @text_buf: A buffer to store text data. * @text_buf_size: The size of @text_buf. - * @dict_buf: A buffer to store dictionary data. - * @dict_buf_size: The size of @dict_buf. * * Initialize all the fields that a reader is interested in. All arguments * (except @r) are optional. Only record data for arguments that are @@ -363,14 +329,11 @@ unsigned int prb_record_text_space(struct prb_reserved_entry *e); */ static inline void prb_rec_init_rd(struct printk_record *r, struct printk_info *info, - char *text_buf, unsigned int text_buf_size, - char *dict_buf, unsigned int dict_buf_size) + char *text_buf, unsigned int text_buf_size) { r->info = info; r->text_buf = text_buf; - r->dict_buf = dict_buf; r->text_buf_size = text_buf_size; - r->dict_buf_size = dict_buf_size; } /** -- cgit v1.2.3 From 59f8bcca1ef6a5326f7c127e11b949e745bfa230 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 30 Sep 2020 11:07:33 +0206 Subject: printk: avoid and/or handle record truncation If a reader provides a buffer that is smaller than the message text, the @text_len field of @info will have a value larger than the buffer size. If readers blindly read @text_len bytes of data without checking the size, they will read beyond their buffer. Add this check to record_print_text() to properly recognize when such truncation has occurred. Add a maximum size argument to the ringbuffer function to extend records so that records can not be created that are larger than the buffer size of readers. When extending records (LOG_CONT), do not extend records beyond LOG_LINE_MAX since that is the maximum size available in the buffers used by consoles and syslog. Fixes: f5f022e53b87 ("printk: reimplement log_cont using record extension") Reported-by: Marek Szyprowski Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200930090134.8723-2-john.ogness@linutronix.de --- kernel/printk/printk.c | 9 ++++++++- kernel/printk/printk_ringbuffer.c | 12 ++++++++++-- kernel/printk/printk_ringbuffer.h | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1fe3d0cb2fe0..15cd73da0528 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1349,6 +1349,13 @@ static size_t record_print_text(struct printk_record *r, bool syslog, size_t len = 0; char *next; + /* + * If the message was truncated because the buffer was not large + * enough, treat the available text as if it were the full text. + */ + if (text_len > buf_size) + text_len = buf_size; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* @@ -1903,7 +1910,7 @@ static size_t log_output(int facility, int level, enum log_flags lflags, struct printk_record r; prb_rec_init_wr(&r, text_len); - if (prb_reserve_in_last(&e, prb, &r, caller_id)) { + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { memcpy(&r.text_buf[r.info->text_len], text, text_len); r.info->text_len += text_len; if (lflags & LOG_NEWLINE) { diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 13b94b92342e..2493348a1631 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -202,7 +202,8 @@ * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * - * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id())) { + * // try to extend, but only if it does not exceed 32 bytes + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * @@ -1309,6 +1310,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). + * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. @@ -1343,7 +1345,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, - struct printk_record *r, u32 caller_id) + struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; @@ -1389,6 +1391,9 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; + if (r->text_buf_size > max_size) + goto fail; + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); } else { @@ -1410,6 +1415,9 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; + if (r->text_buf_size > max_size) + goto fail; + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 0adaa685d1ca..5dc9d022db07 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -303,7 +303,7 @@ static inline void prb_rec_init_wr(struct printk_record *r, bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r); bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, - struct printk_record *r, u32 caller_id); + struct printk_record *r, u32 caller_id, unsigned int max_size); void prb_commit(struct prb_reserved_entry *e); void prb_final_commit(struct prb_reserved_entry *e); -- cgit v1.2.3 From 0463d04ea03a12d8a5aad42197a5945dfd78d7d6 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 30 Sep 2020 11:07:34 +0206 Subject: printk: reduce setup_text_buf size to LOG_LINE_MAX @setup_text_buf only copies the original text messages (without any prefix or extended text). It only needs to be LOG_LINE_MAX in size. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200930090134.8723-3-john.ogness@linutronix.de --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/printk') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 15cd73da0528..f5c2945d1e3f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1095,7 +1095,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, return prb_record_text_space(&e); } -static char setup_text_buf[CONSOLE_EXT_LOG_MAX] __initdata; +static char setup_text_buf[LOG_LINE_MAX] __initdata; void __init setup_log_buf(int early) { -- cgit v1.2.3