net/mlx5: Avoid double clear or set of sync reset requested

Double clear of reset requested state can lead to NULL pointer as it
will try to delete the timer twice. This can happen for example on a
race between abort from FW and pci error or reset. Avoid such case using
test_and_clear_bit() to verify only one time reset requested state clear
flow. Similarly use test_and_set_bit() to verify only one time reset
requested state set flow.

Fixes: 7dd6df329d ("net/mlx5: Handle sync reset abort event")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
Moshe Shemesh 2022-04-11 20:38:44 +03:00 committed by Saeed Mahameed
parent cb7786a76e
commit fc3d3db07b

View file

@ -162,14 +162,19 @@ static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
del_timer_sync(&fw_reset->timer);
}
static void mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health)
static int mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health)
{
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
if (!test_and_clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) {
mlx5_core_warn(dev, "Reset request was already cleared\n");
return -EALREADY;
}
mlx5_stop_sync_reset_poll(dev);
clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
if (poll_health)
mlx5_start_health_poll(dev);
return 0;
}
static void mlx5_sync_reset_reload_work(struct work_struct *work)
@ -229,13 +234,17 @@ static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev)
return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 2, false);
}
static void mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev)
static int mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev)
{
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
if (test_and_set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) {
mlx5_core_warn(dev, "Reset request was already set\n");
return -EALREADY;
}
mlx5_stop_health_poll(dev, true);
set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags);
mlx5_start_sync_reset_poll(dev);
return 0;
}
static void mlx5_fw_live_patch_event(struct work_struct *work)
@ -264,7 +273,9 @@ static void mlx5_sync_reset_request_event(struct work_struct *work)
err ? "Failed" : "Sent");
return;
}
mlx5_sync_reset_set_reset_requested(dev);
if (mlx5_sync_reset_set_reset_requested(dev))
return;
err = mlx5_fw_reset_set_reset_sync_ack(dev);
if (err)
mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err);
@ -362,7 +373,8 @@ static void mlx5_sync_reset_now_event(struct work_struct *work)
struct mlx5_core_dev *dev = fw_reset->dev;
int err;
mlx5_sync_reset_clear_reset_requested(dev, false);
if (mlx5_sync_reset_clear_reset_requested(dev, false))
return;
mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");
@ -391,10 +403,8 @@ static void mlx5_sync_reset_abort_event(struct work_struct *work)
reset_abort_work);
struct mlx5_core_dev *dev = fw_reset->dev;
if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags))
if (mlx5_sync_reset_clear_reset_requested(dev, true))
return;
mlx5_sync_reset_clear_reset_requested(dev, true);
mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n");
}