chore: Make jupyter integration tests less flaky and avoid hang (#23134)

There's a TOCTOU issue that can happen when selecting unused ports for
the server to use (we get assigned an unused port by the OS, and between
then and when the server actually binds to the port another test steals
it). Improve this by checking if the server existed soon after setup,
and if so we retry starting it. Client connection can also fail
spuriously (in local testing) so added a retry mechanism.

This also fixes a hang, where if the server exited (almost always due to
the issue described above) before we connected to it, attempting to
connect our client ZMQ sockets to it would just hang. To resolve this, I
added a timeout so we can't wait forever.
This commit is contained in:
Nathan Whitaker 2024-03-29 23:23:48 -07:00 committed by GitHub
parent 524e451bfb
commit 99720d0713
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -191,7 +191,24 @@ async fn connect_socket<S: zeromq::Socket>(
) -> S {
let addr = spec.endpoint(port);
let mut socket = S::new();
socket.connect(&addr).await.unwrap();
let mut connected = false;
for _ in 0..5 {
match timeout(Duration::from_secs(5), socket.connect(&addr)).await {
Ok(Ok(_)) => {
connected = true;
break;
}
Ok(Err(e)) => {
eprintln!("Failed to connect to {addr}: {e}");
}
Err(e) => {
eprintln!("Timed out connecting to {addr}: {e}");
}
}
}
if !connected {
panic!("Failed to connect to {addr}");
}
socket
}
@ -369,27 +386,48 @@ impl Drop for JupyterServerProcess {
}
}
fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) {
async fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) {
let context = TestContextBuilder::new().use_temp_cwd().build();
let conn = ConnectionSpec::default();
let mut conn = ConnectionSpec::default();
let conn_file = context.temp_dir().path().join("connection.json");
conn_file.write_json(&conn);
let process = context
.new_command()
.piped_output()
.args_vec(vec![
"jupyter",
"--kernel",
"--conn",
conn_file.to_string().as_str(),
])
.spawn()
.unwrap();
let start_process = |conn_file: &test_util::PathRef| {
context
.new_command()
.args_vec(vec![
"jupyter",
"--kernel",
"--conn",
conn_file.to_string().as_str(),
])
.spawn()
.unwrap()
};
// try to start the server, retrying up to 5 times
// (this can happen due to TOCTOU errors with selecting unused TCP ports)
let mut process = start_process(&conn_file);
tokio::time::sleep(Duration::from_millis(1000)).await;
for _ in 0..5 {
if process.try_wait().unwrap().is_none() {
break;
} else {
conn = ConnectionSpec::default();
conn_file.write_json(&conn);
process = start_process(&conn_file);
tokio::time::sleep(Duration::from_millis(1000)).await;
}
}
if process.try_wait().unwrap().is_some() {
panic!("Failed to start Jupyter server");
}
(context, conn, JupyterServerProcess(Some(process)))
}
async fn setup() -> (TestContext, JupyterClient, JupyterServerProcess) {
let (context, conn, process) = setup_server();
let (context, conn, process) = setup_server().await;
let client = JupyterClient::new(&conn).await;
client.io_subscribe("").await.unwrap();