From 2fc4876ea8a9932e0d0bd84daf638186fcadd01f Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 31 Aug 2017 13:18:22 +0200 Subject: [PATCH 1/8] s390/mm: use VM_BUG_ON in crst_table_[upgrade|downgrade] The BUG_ON in crst_table_[upgrade|downgrade] is a debugging aid, replace it with VM_BUG_ON. Signed-off-by: Martin Schwidefsky --- arch/s390/mm/pgalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index c5b74dd61197..05b5b1b0a8d9 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -83,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) int rc, notify; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - BUG_ON(mm->context.asce_limit < _REGION2_SIZE); + VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); if (end >= TASK_SIZE_MAX) return -ENOMEM; rc = 0; @@ -124,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd_t *pgd; /* downgrade should only happen from 3 to 2 levels (compat only) */ - BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); if (current->active_mm == mm) { clear_user_asce(); From e7fc5146cfe4f1b10f2ed6c36b65248aa948abe8 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 8 Nov 2016 07:09:13 +0100 Subject: [PATCH 2/8] s390/zcrypt: externalize test AP queue Under certain specified conditions, the Test AP Queue (TAPQ) subfunction of the Process Adjunct Processor Queue (PQAP) instruction will be intercepted by a guest VM. The guest VM must have a means for executing the intercepted instruction. The vfio_ap driver will provide an interface to execute the PQAP(TAPQ) instruction subfunction on behalf of a guest VM. The code for executing the AP instructions currently resides in the AP bus. This patch refactors the AP bus code to externalize access to the PQAP(TAPQ) instruction subfunction to make it available to the vfio_ap driver. 
Signed-off-by: Tony Krowiak Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ap.h | 64 ++++++++++++++++++++++++++++++++++ drivers/s390/crypto/ap_bus.c | 28 +++++++++++---- drivers/s390/crypto/ap_bus.h | 36 +------------------ drivers/s390/crypto/ap_queue.c | 2 +- 4 files changed, 88 insertions(+), 42 deletions(-) create mode 100644 arch/s390/include/asm/ap.h diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h new file mode 100644 index 000000000000..0f3e6f38db55 --- /dev/null +++ b/arch/s390/include/asm/ap.h @@ -0,0 +1,64 @@ +/* + * Adjunct processor (AP) interfaces + * + * Copyright IBM Corp. 2017 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): Tony Krowiak + * Martin Schwidefsky + * Harald Freudenberger + */ + +#ifndef _ASM_S390_AP_H_ +#define _ASM_S390_AP_H_ + +/** + * The ap_qid_t identifier of an ap queue. + * If the AP facilities test (APFT) facility is available, + * card and queue index are 8 bit values, otherwise + * card index is 6 bit and queue index a 4 bit value. + */ +typedef unsigned int ap_qid_t; + +#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255)) +#define AP_QID_CARD(_qid) (((_qid) >> 8) & 63) +#define AP_QID_QUEUE(_qid) ((_qid) & 255) + +/** + * struct ap_queue_status - Holds the AP queue status. + * @queue_empty: Shows if queue is empty + * @replies_waiting: Waiting replies + * @queue_full: Is 1 if the queue is full + * @irq_enabled: Shows if interrupts are enabled for the AP + * @response_code: Holds the 8 bit response code + * + * The ap queue status word is returned by all three AP functions + * (PQAP, NQAP and DQAP). There's a set of flags in the first + * byte, followed by a 1 byte response code. 
+ */ +struct ap_queue_status { + unsigned int queue_empty : 1; + unsigned int replies_waiting : 1; + unsigned int queue_full : 1; + unsigned int _pad1 : 4; + unsigned int irq_enabled : 1; + unsigned int response_code : 8; + unsigned int _pad2 : 16; +}; + +/** + * ap_test_queue(): Test adjunct processor queue. + * @qid: The AP queue number + * @tbit: Test facilities bit + * @info: Pointer to queue descriptor + * + * Returns AP queue status structure. + */ +struct ap_queue_status ap_test_queue(ap_qid_t qid, + int tbit, + unsigned long *info); + +#endif /* _ASM_S390_AP_H_ */ diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 6dee598979e7..4fcfa8d4e0b5 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -165,20 +165,34 @@ static int ap_configuration_available(void) return test_facility(12); } +/** + * ap_apft_available(): Test if AP facilities test (APFT) + * facility is available. + * + * Returns 1 if APFT is available. + */ +static int ap_apft_available(void) +{ + return test_facility(15); +} + /** * ap_test_queue(): Test adjunct processor queue. * @qid: The AP queue number + * @tbit: Test facilities bit * @info: Pointer to queue descriptor * * Returns AP queue status structure. 
*/ -static inline struct ap_queue_status -ap_test_queue(ap_qid_t qid, unsigned long *info) +struct ap_queue_status ap_test_queue(ap_qid_t qid, + int tbit, + unsigned long *info) { - if (test_facility(15)) - qid |= 1UL << 23; /* set APFT T bit*/ + if (tbit) + qid |= 1UL << 23; /* set T bit*/ return ap_tapq(qid, info); } +EXPORT_SYMBOL(ap_test_queue); static inline int ap_query_configuration(void) { @@ -261,7 +275,7 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type, if (!ap_test_config_card_id(AP_QID_CARD(qid))) return -ENODEV; - status = ap_test_queue(qid, &info); + status = ap_test_queue(qid, ap_apft_available(), &info); switch (status.response_code) { case AP_RESPONSE_NORMAL: *queue_depth = (int)(info & 0xff); @@ -940,7 +954,9 @@ static int ap_select_domain(void) for (j = 0; j < AP_DEVICES; j++) { if (!ap_test_config_card_id(j)) continue; - status = ap_test_queue(AP_MKQID(j, i), NULL); + status = ap_test_queue(AP_MKQID(j, i), + ap_apft_available(), + NULL); if (status.response_code != AP_RESPONSE_NORMAL) continue; count++; diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 4dc7c88fb054..f07698d41f77 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -28,6 +28,7 @@ #include #include +#include #define AP_DEVICES 64 /* Number of AP devices. */ #define AP_DOMAINS 256 /* Number of AP domains. */ @@ -40,41 +41,6 @@ extern int ap_domain_index; extern spinlock_t ap_list_lock; extern struct list_head ap_card_list; -/** - * The ap_qid_t identifier of an ap queue. It contains a - * 6 bit card index and a 4 bit queue index (domain). - */ -typedef unsigned int ap_qid_t; - -#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255)) -#define AP_QID_CARD(_qid) (((_qid) >> 8) & 63) -#define AP_QID_QUEUE(_qid) ((_qid) & 255) - -/** - * structy ap_queue_status - Holds the AP queue status. 
- * @queue_empty: Shows if queue is empty - * @replies_waiting: Waiting replies - * @queue_full: Is 1 if the queue is full - * @pad: A 4 bit pad - * @int_enabled: Shows if interrupts are enabled for the AP - * @response_code: Holds the 8 bit response code - * @pad2: A 16 bit pad - * - * The ap queue status word is returned by all three AP functions - * (PQAP, NQAP and DQAP). There's a set of flags in the first - * byte, followed by a 1 byte response code. - */ -struct ap_queue_status { - unsigned int queue_empty : 1; - unsigned int replies_waiting : 1; - unsigned int queue_full : 1; - unsigned int pad1 : 4; - unsigned int int_enabled : 1; - unsigned int response_code : 8; - unsigned int pad2 : 16; -} __packed; - - static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) { return (*ptr & (0x80000000u >> nr)) != 0; diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 0f1a5d02acb0..c906fb73a215 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -362,7 +362,7 @@ static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq) /* Get the status with TAPQ */ status = ap_tapq(aq->qid, NULL); - if (status.int_enabled == 1) { + if (status.irq_enabled == 1) { /* Irqs are now enabled */ aq->interrupt = AP_INTR_ENABLED; aq->state = (aq->queue_count > 0) ? From 050349b5b71df52c24989037bd6515cb54c3ef35 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 8 Nov 2016 11:54:28 +0100 Subject: [PATCH 3/8] s390/zcrypt: externalize AP config info query KVM has a need to fetch the crypto configuration information as it is returned by the PQAP(QCI) instruction. This patch introduces a new API ap_query_configuration() which provides this info in a handy way for the caller. 
Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ap.h | 26 ++++++++++++++++++++++++++ drivers/s390/crypto/ap_bus.c | 21 ++++++++++++++++----- drivers/s390/crypto/ap_bus.h | 11 ----------- 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 0f3e6f38db55..8cefd6ed981d 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -61,4 +61,30 @@ struct ap_queue_status ap_test_queue(ap_qid_t qid, int tbit, unsigned long *info); +struct ap_config_info { + unsigned int apsc : 1; /* S bit */ + unsigned int apxa : 1; /* N bit */ + unsigned int qact : 1; /* C bit */ + unsigned int rc8a : 1; /* R bit */ + unsigned char _reserved1 : 4; + unsigned char _reserved2[3]; + unsigned char Na; /* max # of APs - 1 */ + unsigned char Nd; /* max # of Domains - 1 */ + unsigned char _reserved3[10]; + unsigned int apm[8]; /* AP ID mask */ + unsigned int aqm[8]; /* AP queue mask */ + unsigned int adm[8]; /* AP domain mask */ + unsigned char _reserved4[16]; +} __aligned(8); + +/* + * ap_query_configuration(): Fetch cryptographic config info + * + * Returns the ap configuration info fetched via PQAP(QCI). + * On success 0 is returned, on failure a negative errno + * is returned, e.g. if the PQAP(QCI) instruction is not + * available, the return value will be -EOPNOTSUPP. + */ +int ap_query_configuration(struct ap_config_info *info); + #endif /* _ASM_S390_AP_H_ */ diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 4fcfa8d4e0b5..5f0be2040272 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -194,12 +194,23 @@ struct ap_queue_status ap_test_queue(ap_qid_t qid, } EXPORT_SYMBOL(ap_test_queue); -static inline int ap_query_configuration(void) +/* + * ap_query_configuration(): Fetch cryptographic config info + * + * Returns the ap configuration info fetched via PQAP(QCI). 
+ * On success 0 is returned, on failure a negative errno + * is returned, e.g. if the PQAP(QCI) instruction is not + * available, the return value will be -EOPNOTSUPP. + */ +int ap_query_configuration(struct ap_config_info *info) { - if (!ap_configuration) + if (!ap_configuration_available()) return -EOPNOTSUPP; - return ap_qci(ap_configuration); + if (!info) + return -EINVAL; + return ap_qci(info); } +EXPORT_SYMBOL(ap_query_configuration); /** * ap_init_configuration(): Allocate and query configuration array. @@ -212,7 +223,7 @@ static void ap_init_configuration(void) ap_configuration = kzalloc(sizeof(*ap_configuration), GFP_KERNEL); if (!ap_configuration) return; - if (ap_query_configuration() != 0) { + if (ap_query_configuration(ap_configuration) != 0) { kfree(ap_configuration); ap_configuration = NULL; return; @@ -1009,7 +1020,7 @@ static void ap_scan_bus(struct work_struct *unused) AP_DBF(DBF_DEBUG, "ap_scan_bus running\n"); - ap_query_configuration(); + ap_query_configuration(ap_configuration); if (ap_select_domain() != 0) goto out; diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index f07698d41f77..754cf2223cfb 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -204,17 +204,6 @@ struct ap_message { struct ap_message *); }; -struct ap_config_info { - unsigned int special_command:1; - unsigned int ap_extended:1; - unsigned char reserved1:6; - unsigned char reserved2[15]; - unsigned int apm[8]; /* AP ID mask */ - unsigned int aqm[8]; /* AP queue mask */ - unsigned int adm[8]; /* AP domain mask */ - unsigned char reserved4[16]; -} __packed; - /** * ap_init_message() - Initialize ap_message. * Initialize a message before using. 
Otherwise this might result in From 46fde9a9d204e322cf4fda391c30213633dcc17f Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 9 Nov 2016 15:00:23 +0100 Subject: [PATCH 4/8] s390/zcrypt: externalize AP queue interrupt control KVM has a need to control the interrupts on real and virtualized AP queue devices. This fix provides a new function to control the interrupt facilities of an AP queue device. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ap.h | 36 ++++++++++++++++++++++++++++++++++ drivers/s390/crypto/ap_asm.h | 9 ++++++--- drivers/s390/crypto/ap_queue.c | 24 ++++++++++++++++++++++- 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 8cefd6ed981d..c02f4aba88a6 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -87,4 +87,40 @@ struct ap_config_info { */ int ap_query_configuration(struct ap_config_info *info); +/* + * struct ap_qirq_ctrl - convenient struct for easy invocation + * of the ap_queue_irq_ctrl() function. This struct is passed + * as GR1 parameter to the PQAP(AQIC) instruction. For details + * please see the AR documentation. + */ +struct ap_qirq_ctrl { + unsigned int _res1 : 8; + unsigned int zone : 8; /* zone info */ + unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ + unsigned int _res2 : 4; + unsigned int gisc : 3; /* guest isc field */ + unsigned int _res3 : 6; + unsigned int gf : 2; /* gisa format */ + unsigned int _res4 : 1; + unsigned int gisa : 27; /* gisa origin */ + unsigned int _res5 : 1; + unsigned int isc : 3; /* irq sub class */ +}; + +/** + * ap_queue_irq_ctrl(): Control interruption on a AP queue. + * @qid: The AP queue number + * @qirqctrl: struct ap_qirq_ctrl, see above + * @ind: The notification indicator byte + * + * Returns AP queue status. + * + * Control interruption on the given AP queue. 
+ * Just a simple wrapper function for the low level PQAP(AQIC) + * instruction available for other kernel modules. + */ +struct ap_queue_status ap_queue_irq_ctrl(ap_qid_t qid, + struct ap_qirq_ctrl qirqctrl, + void *ind); + #endif /* _ASM_S390_AP_H_ */ diff --git a/drivers/s390/crypto/ap_asm.h b/drivers/s390/crypto/ap_asm.h index 287b4ad0999e..cd350345b3d2 100644 --- a/drivers/s390/crypto/ap_asm.h +++ b/drivers/s390/crypto/ap_asm.h @@ -69,16 +69,19 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid) } /** - * ap_aqic(): Enable interruption for a specific AP. + * ap_aqic(): Control interruption for a specific AP. * @qid: The AP queue number + * @qirqctrl: struct ap_qirq_ctrl (64 bit value) * @ind: The notification indicator byte * * Returns AP queue status. */ -static inline struct ap_queue_status ap_aqic(ap_qid_t qid, void *ind) +static inline struct ap_queue_status ap_aqic(ap_qid_t qid, + struct ap_qirq_ctrl qirqctrl, + void *ind) { register unsigned long reg0 asm ("0") = qid | (3UL << 24); - register unsigned long reg1_in asm ("1") = (8UL << 44) | AP_ISC; + register struct ap_qirq_ctrl reg1_in asm ("1") = qirqctrl; register struct ap_queue_status reg1_out asm ("1"); register void *reg2 asm ("2") = ind; diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index c906fb73a215..56b96edffd5b 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -15,6 +15,25 @@ #include "ap_bus.h" #include "ap_asm.h" +/** + * ap_queue_irq_ctrl(): Control interruption on a AP queue. + * @qirqctrl: struct ap_qirq_ctrl (64 bit value) + * @ind: The notification indicator byte + * + * Returns AP queue status. + * + * Control interruption on the given AP queue. + * Just a simple wrapper function for the low level PQAP(AQIC) + * instruction available for other kernel modules. 
+ */ +struct ap_queue_status ap_queue_irq_ctrl(ap_qid_t qid, + struct ap_qirq_ctrl qirqctrl, + void *ind) +{ + return ap_aqic(qid, qirqctrl, ind); +} +EXPORT_SYMBOL(ap_queue_irq_ctrl); + /** * ap_queue_enable_interruption(): Enable interruption on an AP queue. * @qid: The AP queue number @@ -27,8 +46,11 @@ static int ap_queue_enable_interruption(struct ap_queue *aq, void *ind) { struct ap_queue_status status; + struct ap_qirq_ctrl qirqctrl = { 0 }; - status = ap_aqic(aq->qid, ind); + qirqctrl.ir = 1; + qirqctrl.isc = AP_ISC; + status = ap_aqic(aq->qid, qirqctrl, ind); switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_OTHERWISE_CHANGED: From b3e5dc45fd1ec2aa1de6b80008f9295eb17e0659 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 16 Aug 2017 14:10:01 +0200 Subject: [PATCH 5/8] s390/mm: fix local TLB flushing vs. detach of an mm address space The local TLB flushing code keeps an additional mask in the mm.context, the cpu_attach_mask. At the time a global flush of an address space is done the cpu_attach_mask is copied to the mm_cpumask in order to avoid future global flushes in case the mm is used by a single CPU only after the flush. Trouble is that the reset of the mm_cpumask is racy against the detach of an mm address space by switch_mm. The current order is first the global TLB flush and then the copy of the cpu_attach_mask to the mm_cpumask. The order needs to be the other way around. 
Cc: Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 4 ++-- arch/s390/include/asm/tlbflush.h | 26 +++++--------------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 72e9ca83a668..8823e35f69a9 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -103,7 +103,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (prev == next) return; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - cpumask_set_cpu(cpu, mm_cpumask(next)); /* Clear old ASCE by loading the kernel ASCE. */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); @@ -121,7 +120,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_disable(); while (atomic_read(&mm->context.flush_count)) cpu_relax(); - + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); if (mm->context.flush_mm) __tlb_flush_mm(mm); preempt_enable(); @@ -136,6 +135,7 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { switch_mm(prev, next, current); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); set_user_asce(next); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 4d759f8f4bc7..16fe2a3d9a03 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -48,23 +48,6 @@ static inline void __tlb_flush_global(void) * Flush TLB entries for a specific mm on all CPUs (in case gmap is used * this implicates multiple ASCEs!). 
*/ -static inline void __tlb_flush_full(struct mm_struct *mm) -{ - preempt_disable(); - atomic_inc(&mm->context.flush_count); - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { - /* Local TLB flush */ - __tlb_flush_local(); - } else { - /* Global TLB flush */ - __tlb_flush_global(); - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); - } - atomic_dec(&mm->context.flush_count); - preempt_enable(); -} - static inline void __tlb_flush_mm(struct mm_struct *mm) { unsigned long gmap_asce; @@ -76,16 +59,18 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) */ preempt_disable(); atomic_inc(&mm->context.flush_count); + /* Reset TLB flush mask */ + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); + barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); } else { - __tlb_flush_full(mm); + /* Global TLB flush */ + __tlb_flush_global(); } - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); atomic_dec(&mm->context.flush_count); preempt_enable(); } @@ -99,7 +84,6 @@ static inline void __tlb_flush_kernel(void) } #else #define __tlb_flush_global() __tlb_flush_local() -#define __tlb_flush_full(mm) __tlb_flush_local() /* * Flush TLB entries for a specific ASCE on all CPUs. From 60f07c8ec5fae06c23e9fd7bab67dabce92b3414 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 17 Aug 2017 08:15:16 +0200 Subject: [PATCH 6/8] s390/mm: fix race on mm->context.flush_mm The order in __tlb_flush_mm_lazy is to flush TLB first and then clear the mm->context.flush_mm bit. This can lead to missed flushes as the bit can be set anytime, the order needs to be the other way around. But this leads to a different race, __tlb_flush_mm_lazy may be called on two CPUs concurrently. 
If mm->context.flush_mm is cleared first then another CPU can bypass __tlb_flush_mm_lazy although the first CPU has not done the flush yet. In a virtualized environment the time until the flush is finally completed can be arbitrarily long. Add a spinlock to serialize __tlb_flush_mm_lazy and use the function in finish_arch_post_lock_switch as well. Cc: Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu.h | 2 ++ arch/s390/include/asm/mmu_context.h | 4 ++-- arch/s390/include/asm/tlbflush.h | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bd6f30304518..3525fe6e7e4c 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -5,6 +5,7 @@ #include typedef struct { + spinlock_t lock; cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; @@ -27,6 +28,7 @@ typedef struct { } mm_context_t; #define INIT_MM_CONTEXT(name) \ + .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ .context.pgtable_lock = \ __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8823e35f69a9..484efe8f4234 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -17,6 +17,7 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + spin_lock_init(&mm->context.lock); spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); spin_lock_init(&mm->context.gmap_lock); @@ -121,8 +122,7 @@ static inline void finish_arch_post_lock_switch(void) while (atomic_read(&mm->context.flush_count)) cpu_relax(); cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - if (mm->context.flush_mm) - __tlb_flush_mm(mm); + __tlb_flush_mm_lazy(mm); preempt_enable(); } set_fs(current->thread.mm_segment); diff --git 
a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 16fe2a3d9a03..b08d5bc2666e 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -101,10 +101,12 @@ static inline void __tlb_flush_kernel(void) static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { + spin_lock(&mm->context.lock); if (mm->context.flush_mm) { - __tlb_flush_mm(mm); mm->context.flush_mm = 0; + __tlb_flush_mm(mm); } + spin_unlock(&mm->context.lock); } /* From f28a4b4ddf8e7181c6c0bc45603d65c4ab6b14f9 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 17 Aug 2017 18:17:49 +0200 Subject: [PATCH 7/8] s390/mm: use a single lock for the fields in mm_context_t The three locks 'lock', 'pgtable_lock' and 'gmap_lock' in the mm_context_t can be reduced to a single lock. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu.h | 5 ----- arch/s390/include/asm/mmu_context.h | 2 -- arch/s390/mm/gmap.c | 8 ++++---- arch/s390/mm/pgalloc.c | 16 ++++++++-------- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 3525fe6e7e4c..3f46a6577b8d 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -9,9 +9,7 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - spinlock_t pgtable_lock; struct list_head pgtable_list; - spinlock_t gmap_lock; struct list_head gmap_list; unsigned long gmap_asce; unsigned long asce; @@ -29,10 +27,7 @@ typedef struct { #define INIT_MM_CONTEXT(name) \ .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ - .context.pgtable_lock = \ - __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ - .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h 
b/arch/s390/include/asm/mmu_context.h index 484efe8f4234..3c9abedc323c 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -18,9 +18,7 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { spin_lock_init(&mm->context.lock); - spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); - spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 9e1494e3d849..2f66290c9b92 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -100,14 +100,14 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) if (!gmap) return NULL; gmap->mm = mm; - spin_lock(&mm->context.gmap_lock); + spin_lock(&mm->context.lock); list_add_rcu(&gmap->list, &mm->context.gmap_list); if (list_is_singular(&mm->context.gmap_list)) gmap_asce = gmap->asce; else gmap_asce = -1UL; WRITE_ONCE(mm->context.gmap_asce, gmap_asce); - spin_unlock(&mm->context.gmap_lock); + spin_unlock(&mm->context.lock); return gmap; } EXPORT_SYMBOL_GPL(gmap_create); @@ -248,7 +248,7 @@ void gmap_remove(struct gmap *gmap) spin_unlock(&gmap->shadow_lock); } /* Remove gmap from the pre-mm list */ - spin_lock(&gmap->mm->context.gmap_lock); + spin_lock(&gmap->mm->context.lock); list_del_rcu(&gmap->list); if (list_empty(&gmap->mm->context.gmap_list)) gmap_asce = 0; @@ -258,7 +258,7 @@ void gmap_remove(struct gmap *gmap) else gmap_asce = -1UL; WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); - spin_unlock(&gmap->mm->context.gmap_lock); + spin_unlock(&gmap->mm->context.lock); synchronize_rcu(); /* Put reference */ gmap_put(gmap); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 05b5b1b0a8d9..05f1f27e6708 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -188,7 +188,7 @@ unsigned long *page_table_alloc(struct 
mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -203,7 +203,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); if (table) return table; } @@ -227,9 +227,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); } return table; } @@ -243,13 +243,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); if (mask != 0) return; } @@ -275,13 +275,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); table = (unsigned long 
*) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } From e443343e509aac82e7281020f25bf8fa0dd46ab7 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Mon, 22 May 2017 10:59:11 +0200 Subject: [PATCH 8/8] s390/dasd: blk-mq conversion Use new blk-mq interfaces. Use multiple queues and also use the block layer complete helper that finishes the IO on the CPU that initiated it. Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 331 ++++++++++++++++--------------- drivers/s390/block/dasd_devmap.c | 8 +- drivers/s390/block/dasd_int.h | 19 +- 3 files changed, 193 insertions(+), 165 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 9c97ad1ee121..ea19b4ff87a2 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -62,7 +62,6 @@ MODULE_LICENSE("GPL"); static int dasd_alloc_queue(struct dasd_block *); static void dasd_setup_queue(struct dasd_block *); static void dasd_free_queue(struct dasd_block *); -static void dasd_flush_request_queue(struct dasd_block *); static int dasd_flush_block_queue(struct dasd_block *); static void dasd_device_tasklet(struct dasd_device *); static void dasd_block_tasklet(struct dasd_block *); @@ -158,7 +157,6 @@ struct dasd_block *dasd_alloc_block(void) /* open_count = 0 means device online but not in use */ atomic_set(&block->open_count, -1); - spin_lock_init(&block->request_queue_lock); atomic_set(&block->tasklet_scheduled, 0); tasklet_init(&block->tasklet, (void (*)(unsigned long)) dasd_block_tasklet, @@ -391,7 +389,6 @@ static int dasd_state_ready_to_basic(struct dasd_device *device) device->state = DASD_STATE_READY; return rc; } - dasd_flush_request_queue(block); dasd_destroy_partitions(block); block->blocks = 0; block->bp_block = 0; @@ -1645,8 +1642,10 @@ void dasd_generic_handle_state_change(struct dasd_device *device) dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); 
dasd_schedule_device_bh(device); - if (device->block) + if (device->block) { dasd_schedule_block_bh(device->block); + blk_mq_run_hw_queues(device->block->request_queue, true); + } } EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); @@ -2638,6 +2637,7 @@ static void dasd_block_timeout(unsigned long ptr) dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); dasd_schedule_block_bh(block); + blk_mq_run_hw_queues(block->request_queue, true); } /* @@ -2677,115 +2677,11 @@ static void __dasd_process_erp(struct dasd_device *device, erp_fn(cqr); } -/* - * Fetch requests from the block device queue. - */ -static void __dasd_process_request_queue(struct dasd_block *block) -{ - struct request_queue *queue; - struct request *req; - struct dasd_ccw_req *cqr; - struct dasd_device *basedev; - unsigned long flags; - queue = block->request_queue; - basedev = block->base; - /* No queue ? Then there is nothing to do. */ - if (queue == NULL) - return; - - /* - * We requeue request from the block device queue to the ccw - * queue only in two states. In state DASD_STATE_READY the - * partition detection is done and we need to requeue requests - * for that. State DASD_STATE_ONLINE is normal block device - * operation. 
- */ - if (basedev->state < DASD_STATE_READY) { - while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, BLK_STS_IOERR); - return; - } - - /* - * if device is stopped do not fetch new requests - * except failfast is active which will let requests fail - * immediately in __dasd_block_start_head() - */ - if (basedev->stopped && !(basedev->features & DASD_FEATURE_FAILFAST)) - return; - - /* Now we try to fetch requests from the request queue */ - while ((req = blk_peek_request(queue))) { - if (basedev->features & DASD_FEATURE_READONLY && - rq_data_dir(req) == WRITE) { - DBF_DEV_EVENT(DBF_ERR, basedev, - "Rejecting write request %p", - req); - blk_start_request(req); - __blk_end_request_all(req, BLK_STS_IOERR); - continue; - } - if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) && - (basedev->features & DASD_FEATURE_FAILFAST || - blk_noretry_request(req))) { - DBF_DEV_EVENT(DBF_ERR, basedev, - "Rejecting failfast request %p", - req); - blk_start_request(req); - __blk_end_request_all(req, BLK_STS_TIMEOUT); - continue; - } - cqr = basedev->discipline->build_cp(basedev, block, req); - if (IS_ERR(cqr)) { - if (PTR_ERR(cqr) == -EBUSY) - break; /* normal end condition */ - if (PTR_ERR(cqr) == -ENOMEM) - break; /* terminate request queue loop */ - if (PTR_ERR(cqr) == -EAGAIN) { - /* - * The current request cannot be build right - * now, we have to try later. If this request - * is the head-of-queue we stop the device - * for 1/2 second. 
- */ - if (!list_empty(&block->ccw_queue)) - break; - spin_lock_irqsave( - get_ccwdev_lock(basedev->cdev), flags); - dasd_device_set_stop_bits(basedev, - DASD_STOPPED_PENDING); - spin_unlock_irqrestore( - get_ccwdev_lock(basedev->cdev), flags); - dasd_block_set_timer(block, HZ/2); - break; - } - DBF_DEV_EVENT(DBF_ERR, basedev, - "CCW creation failed (rc=%ld) " - "on request %p", - PTR_ERR(cqr), req); - blk_start_request(req); - __blk_end_request_all(req, BLK_STS_IOERR); - continue; - } - /* - * Note: callback is set to dasd_return_cqr_cb in - * __dasd_block_start_head to cover erp requests as well - */ - cqr->callback_data = (void *) req; - cqr->status = DASD_CQR_FILLED; - req->completion_data = cqr; - blk_start_request(req); - list_add_tail(&cqr->blocklist, &block->ccw_queue); - INIT_LIST_HEAD(&cqr->devlist); - dasd_profile_start(block, cqr, req); - } -} - static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) { struct request *req; - int status; blk_status_t error = BLK_STS_OK; + int status; req = (struct request *) cqr->callback_data; dasd_profile_end(cqr->block, cqr, req); @@ -2809,7 +2705,19 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) break; } } - __blk_end_request_all(req, error); + + /* + * We need to take care for ETIMEDOUT errors here since the + * complete callback does not get called in this case. + * Take care of all errors here and avoid additional code to + * transfer the error value to the complete callback. 
+ */ + if (error) { + blk_mq_end_request(req, error); + blk_mq_run_hw_queues(req->q, true); + } else { + blk_mq_complete_request(req); + } } /* @@ -2938,27 +2846,30 @@ static void dasd_block_tasklet(struct dasd_block *block) struct list_head final_queue; struct list_head *l, *n; struct dasd_ccw_req *cqr; + struct dasd_queue *dq; atomic_set(&block->tasklet_scheduled, 0); INIT_LIST_HEAD(&final_queue); - spin_lock(&block->queue_lock); + spin_lock_irq(&block->queue_lock); /* Finish off requests on ccw queue */ __dasd_process_block_ccw_queue(block, &final_queue); - spin_unlock(&block->queue_lock); + spin_unlock_irq(&block->queue_lock); + /* Now call the callback function of requests with final status */ - spin_lock_irq(&block->request_queue_lock); list_for_each_safe(l, n, &final_queue) { cqr = list_entry(l, struct dasd_ccw_req, blocklist); + dq = cqr->dq; + spin_lock_irq(&dq->lock); list_del_init(&cqr->blocklist); __dasd_cleanup_cqr(cqr); + spin_unlock_irq(&dq->lock); } - spin_lock(&block->queue_lock); - /* Get new request from the block device request queue */ - __dasd_process_request_queue(block); + + spin_lock_irq(&block->queue_lock); /* Now check if the head of the ccw queue needs to be started. 
*/ __dasd_block_start_head(block); - spin_unlock(&block->queue_lock); - spin_unlock_irq(&block->request_queue_lock); + spin_unlock_irq(&block->queue_lock); + if (waitqueue_active(&shutdown_waitq)) wake_up(&shutdown_waitq); dasd_put_device(block->base); @@ -2977,14 +2888,13 @@ static int _dasd_requeue_request(struct dasd_ccw_req *cqr) { struct dasd_block *block = cqr->block; struct request *req; - unsigned long flags; if (!block) return -EINVAL; - spin_lock_irqsave(&block->request_queue_lock, flags); + spin_lock_irq(&cqr->dq->lock); req = (struct request *) cqr->callback_data; - blk_requeue_request(block->request_queue, req); - spin_unlock_irqrestore(&block->request_queue_lock, flags); + blk_mq_requeue_request(req, false); + spin_unlock_irq(&cqr->dq->lock); return 0; } @@ -2999,6 +2909,7 @@ static int dasd_flush_block_queue(struct dasd_block *block) struct dasd_ccw_req *cqr, *n; int rc, i; struct list_head flush_queue; + unsigned long flags; INIT_LIST_HEAD(&flush_queue); spin_lock_bh(&block->queue_lock); @@ -3037,11 +2948,11 @@ static int dasd_flush_block_queue(struct dasd_block *block) goto restart_cb; } /* call the callback function */ - spin_lock_irq(&block->request_queue_lock); + spin_lock_irqsave(&cqr->dq->lock, flags); cqr->endclk = get_tod_clock(); list_del_init(&cqr->blocklist); __dasd_cleanup_cqr(cqr); - spin_unlock_irq(&block->request_queue_lock); + spin_unlock_irqrestore(&cqr->dq->lock, flags); } return rc; } @@ -3069,42 +2980,114 @@ EXPORT_SYMBOL(dasd_schedule_block_bh); /* * Dasd request queue function. 
Called from ll_rw_blk.c */ -static void do_dasd_request(struct request_queue *queue) +static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *qd) { - struct dasd_block *block; + struct dasd_block *block = hctx->queue->queuedata; + struct dasd_queue *dq = hctx->driver_data; + struct request *req = qd->rq; + struct dasd_device *basedev; + struct dasd_ccw_req *cqr; + blk_status_t rc = BLK_STS_OK; - block = queue->queuedata; + basedev = block->base; + spin_lock_irq(&dq->lock); + if (basedev->state < DASD_STATE_READY) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "device not ready for request %p", req); + rc = BLK_STS_IOERR; + goto out; + } + + /* + * if device is stopped do not fetch new requests + * except failfast is active which will let requests fail + * immediately in __dasd_block_start_head() + */ + if (basedev->stopped && !(basedev->features & DASD_FEATURE_FAILFAST)) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "device stopped request %p", req); + rc = BLK_STS_RESOURCE; + goto out; + } + + if (basedev->features & DASD_FEATURE_READONLY && + rq_data_dir(req) == WRITE) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "Rejecting write request %p", req); + rc = BLK_STS_IOERR; + goto out; + } + + if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) && + (basedev->features & DASD_FEATURE_FAILFAST || + blk_noretry_request(req))) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "Rejecting failfast request %p", req); + rc = BLK_STS_IOERR; + goto out; + } + + cqr = basedev->discipline->build_cp(basedev, block, req); + if (IS_ERR(cqr)) { + if (PTR_ERR(cqr) == -EBUSY || + PTR_ERR(cqr) == -ENOMEM || + PTR_ERR(cqr) == -EAGAIN) { + rc = BLK_STS_RESOURCE; + goto out; + } + DBF_DEV_EVENT(DBF_ERR, basedev, + "CCW creation failed (rc=%ld) on request %p", + PTR_ERR(cqr), req); + rc = BLK_STS_IOERR; + goto out; + } + /* + * Note: callback is set to dasd_return_cqr_cb in + * __dasd_block_start_head to cover erp requests as well + */ + cqr->callback_data = req; + cqr->status = 
DASD_CQR_FILLED; + cqr->dq = dq; + req->completion_data = cqr; + blk_mq_start_request(req); spin_lock(&block->queue_lock); - /* Get new request from the block device request queue */ - __dasd_process_request_queue(block); - /* Now check if the head of the ccw queue needs to be started. */ - __dasd_block_start_head(block); + list_add_tail(&cqr->blocklist, &block->ccw_queue); + INIT_LIST_HEAD(&cqr->devlist); + dasd_profile_start(block, cqr, req); + dasd_schedule_block_bh(block); spin_unlock(&block->queue_lock); + +out: + spin_unlock_irq(&dq->lock); + return rc; } /* * Block timeout callback, called from the block layer * - * request_queue lock is held on entry. - * * Return values: * BLK_EH_RESET_TIMER if the request should be left running * BLK_EH_NOT_HANDLED if the request is handled or terminated * by the driver. */ -enum blk_eh_timer_return dasd_times_out(struct request *req) +enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved) { struct dasd_ccw_req *cqr = req->completion_data; struct dasd_block *block = req->q->queuedata; struct dasd_device *device; + unsigned long flags; int rc = 0; if (!cqr) return BLK_EH_NOT_HANDLED; + spin_lock_irqsave(&cqr->dq->lock, flags); device = cqr->startdev ? cqr->startdev : block->base; - if (!device->blk_timeout) + if (!device->blk_timeout) { + spin_unlock_irqrestore(&cqr->dq->lock, flags); return BLK_EH_RESET_TIMER; + } DBF_DEV_EVENT(DBF_WARNING, device, " dasd_times_out cqr %p status %x", cqr, cqr->status); @@ -3154,19 +3137,64 @@ enum blk_eh_timer_return dasd_times_out(struct request *req) } dasd_schedule_block_bh(block); spin_unlock(&block->queue_lock); + spin_unlock_irqrestore(&cqr->dq->lock, flags); return rc ? 
BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; } +static int dasd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int idx) +{ + struct dasd_queue *dq = kzalloc(sizeof(*dq), GFP_KERNEL); + + if (!dq) + return -ENOMEM; + + spin_lock_init(&dq->lock); + hctx->driver_data = dq; + + return 0; +} + +static void dasd_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int idx) +{ + kfree(hctx->driver_data); + hctx->driver_data = NULL; +} + +static void dasd_request_done(struct request *req) +{ + blk_mq_end_request(req, 0); + blk_mq_run_hw_queues(req->q, true); +} + +static struct blk_mq_ops dasd_mq_ops = { + .queue_rq = do_dasd_request, + .complete = dasd_request_done, + .timeout = dasd_times_out, + .init_hctx = dasd_init_hctx, + .exit_hctx = dasd_exit_hctx, +}; + /* * Allocate and initialize request queue and default I/O scheduler. */ static int dasd_alloc_queue(struct dasd_block *block) { - block->request_queue = blk_init_queue(do_dasd_request, - &block->request_queue_lock); - if (block->request_queue == NULL) - return -ENOMEM; + int rc; + + block->tag_set.ops = &dasd_mq_ops; + block->tag_set.nr_hw_queues = DASD_NR_HW_QUEUES; + block->tag_set.queue_depth = DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV; + block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + + rc = blk_mq_alloc_tag_set(&block->tag_set); + if (rc) + return rc; + + block->request_queue = blk_mq_init_queue(&block->tag_set); + if (IS_ERR(block->request_queue)) + return PTR_ERR(block->request_queue); block->request_queue->queuedata = block; @@ -3229,26 +3257,11 @@ static void dasd_free_queue(struct dasd_block *block) { if (block->request_queue) { blk_cleanup_queue(block->request_queue); + blk_mq_free_tag_set(&block->tag_set); block->request_queue = NULL; } } -/* - * Flush request on the request queue. 
- */ -static void dasd_flush_request_queue(struct dasd_block *block) -{ - struct request *req; - - if (!block->request_queue) - return; - - spin_lock_irq(&block->request_queue_lock); - while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, BLK_STS_IOERR); - spin_unlock_irq(&block->request_queue_lock); -} - static int dasd_open(struct block_device *bdev, fmode_t mode) { struct dasd_device *base; @@ -3744,8 +3757,10 @@ int dasd_generic_path_operational(struct dasd_device *device) return 1; } dasd_schedule_device_bh(device); - if (device->block) + if (device->block) { dasd_schedule_block_bh(device->block); + blk_mq_run_hw_queues(device->block->request_queue, true); + } if (!device->stopped) wake_up(&generic_waitq); @@ -4008,8 +4023,10 @@ int dasd_generic_restore_device(struct ccw_device *cdev) */ device->stopped |= DASD_UNRESUMED_PM; - if (device->block) + if (device->block) { dasd_schedule_block_bh(device->block); + blk_mq_run_hw_queues(device->block->request_queue, true); + } clear_bit(DASD_FLAG_SUSPENDED, &device->flags); dasd_put_device(device); diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index e38042ce94e6..c95a4784c191 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -1326,7 +1326,7 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr, { struct dasd_device *device; struct request_queue *q; - unsigned long val, flags; + unsigned long val; device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device) || !device->block) @@ -1342,16 +1342,10 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr, dasd_put_device(device); return -ENODEV; } - spin_lock_irqsave(&device->block->request_queue_lock, flags); - if (!val) - blk_queue_rq_timed_out(q, NULL); - else - blk_queue_rq_timed_out(q, dasd_times_out); device->blk_timeout = val; blk_queue_rq_timeout(q, device->blk_timeout * HZ); - 
spin_unlock_irqrestore(&device->block->request_queue_lock, flags); dasd_put_device(device); return count; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index f9e25fc03d6b..db470bd10175 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -56,6 +56,7 @@ #include <asm/debug.h> #include <asm/dasd.h> #include <asm/idals.h> +#include <linux/blk-mq.h> /* DASD discipline magic */ #define DASD_ECKD_MAGIC 0xC5C3D2C4 @@ -185,6 +186,7 @@ struct dasd_ccw_req { char status; /* status of this request */ short retries; /* A retry counter */ unsigned long flags; /* flags of this request */ + struct dasd_queue *dq; /* ... and how */ unsigned long starttime; /* jiffies time of request start */ @@ -248,6 +250,16 @@ struct dasd_ccw_req { #define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */ #define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */ +/* + * There is no reliable way to determine the number of available CPUs on + * LPAR but there is no big performance difference between 1 and the + * maximum CPU number. + * 64 is a good trade off performance wise. + */ +#define DASD_NR_HW_QUEUES 64 +#define DASD_MAX_LCU_DEV 256 +#define DASD_REQ_PER_DEV 4 + /* Signature for error recovery functions.
*/ typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *); @@ -539,6 +551,7 @@ struct dasd_block { struct gendisk *gdp; struct request_queue *request_queue; spinlock_t request_queue_lock; + struct blk_mq_tag_set tag_set; struct block_device *bdev; atomic_t open_count; @@ -563,6 +576,10 @@ struct dasd_attention_data { __u8 lpum; }; +struct dasd_queue { + spinlock_t lock; +}; + /* reasons why device (ccw_device_start) was stopped */ #define DASD_STOPPED_NOT_ACC 1 /* not accessible */ #define DASD_STOPPED_QUIESCE 2 /* Quiesced */ @@ -731,7 +748,7 @@ void dasd_free_device(struct dasd_device *); struct dasd_block *dasd_alloc_block(void); void dasd_free_block(struct dasd_block *); -enum blk_eh_timer_return dasd_times_out(struct request *req); +enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved); void dasd_enable_device(struct dasd_device *); void dasd_set_target_state(struct dasd_device *, int);