chore: (Hopefully) make jupyter_execute_request test less flaky (#23689)

A bunch of small things, mostly around timing and making sure the
jupyter kernel is actually running and ready to respond to requests. I
reproduced the flakiness by running a script to run a bunch of instances
of the test in parallel, where I could get failures consistently. After
this PR, I can't reproduce the flakiness locally which hopefully means
that applies to CI as well
This commit is contained in:
Nathan Whitaker 2024-05-06 16:32:22 -07:00 committed by GitHub
parent 2dcbef2abb
commit 8eb1f11112
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -47,25 +47,43 @@ impl ConnectionSpec {
} }
} }
fn pick_unused_port() -> u16 { /// Gets an unused port from the OS, and returns the port number and a
/// `TcpListener` bound to that port. You can keep the listener alive
/// to prevent another process from binding to the port.
fn pick_unused_port() -> (u16, std::net::TcpListener) {
let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
listener.local_addr().unwrap().port() (listener.local_addr().unwrap().port(), listener)
} }
impl Default for ConnectionSpec { impl ConnectionSpec {
fn default() -> Self { fn new() -> (Self, Vec<std::net::TcpListener>) {
let mut listeners = Vec::new();
let (hb_port, listener) = pick_unused_port();
listeners.push(listener);
let (control_port, listener) = pick_unused_port();
listeners.push(listener);
let (shell_port, listener) = pick_unused_port();
listeners.push(listener);
let (stdin_port, listener) = pick_unused_port();
listeners.push(listener);
let (iopub_port, listener) = pick_unused_port();
listeners.push(listener);
(
Self { Self {
key: "".into(), key: "".into(),
signature_scheme: "hmac-sha256".into(), signature_scheme: "hmac-sha256".into(),
transport: "tcp".into(), transport: "tcp".into(),
ip: "127.0.0.1".into(), ip: "127.0.0.1".into(),
hb_port: pick_unused_port(), hb_port,
control_port: pick_unused_port(), control_port,
shell_port: pick_unused_port(), shell_port,
stdin_port: pick_unused_port(), stdin_port,
iopub_port: pick_unused_port(), iopub_port,
kernel_name: "deno".into(), kernel_name: "deno".into(),
} },
listeners,
)
} }
} }
@ -191,26 +209,16 @@ async fn connect_socket<S: zeromq::Socket>(
) -> S { ) -> S {
let addr = spec.endpoint(port); let addr = spec.endpoint(port);
let mut socket = S::new(); let mut socket = S::new();
let mut connected = false; match timeout(Duration::from_millis(5000), socket.connect(&addr)).await {
for _ in 0..5 { Ok(Ok(_)) => socket,
match timeout(Duration::from_secs(5), socket.connect(&addr)).await {
Ok(Ok(_)) => {
connected = true;
break;
}
Ok(Err(e)) => { Ok(Err(e)) => {
eprintln!("Failed to connect to {addr}: {e}"); panic!("Failed to connect to {addr}: {e}");
} }
Err(e) => { Err(e) => {
eprintln!("Timed out connecting to {addr}: {e}"); panic!("Timed out connecting to {addr}: {e}");
} }
} }
} }
if !connected {
panic!("Failed to connect to {addr}");
}
socket
}
#[derive(Clone)] #[derive(Clone)]
struct JupyterClient { struct JupyterClient {
@ -236,7 +244,7 @@ use JupyterChannel::*;
impl JupyterClient { impl JupyterClient {
async fn new(spec: &ConnectionSpec) -> Self { async fn new(spec: &ConnectionSpec) -> Self {
Self::new_with_timeout(spec, Duration::from_secs(5)).await Self::new_with_timeout(spec, Duration::from_secs(10)).await
} }
async fn new_with_timeout(spec: &ConnectionSpec, timeout: Duration) -> Self { async fn new_with_timeout(spec: &ConnectionSpec, timeout: Duration) -> Self {
@ -386,9 +394,36 @@ impl Drop for JupyterServerProcess {
} }
} }
async fn server_ready_on(addr: &str) -> bool {
matches!(
timeout(
Duration::from_millis(1000),
tokio::net::TcpStream::connect(addr.trim_start_matches("tcp://")),
)
.await,
Ok(Ok(_))
)
}
async fn server_ready(conn: &ConnectionSpec) -> bool {
let hb = conn.endpoint(conn.hb_port);
let control = conn.endpoint(conn.control_port);
let shell = conn.endpoint(conn.shell_port);
let stdin = conn.endpoint(conn.stdin_port);
let iopub = conn.endpoint(conn.iopub_port);
let (a, b, c, d, e) = tokio::join!(
server_ready_on(&hb),
server_ready_on(&control),
server_ready_on(&shell),
server_ready_on(&stdin),
server_ready_on(&iopub),
);
a && b && c && d && e
}
async fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) { async fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) {
let context = TestContextBuilder::new().use_temp_cwd().build(); let context = TestContextBuilder::new().use_temp_cwd().build();
let mut conn = ConnectionSpec::default(); let (mut conn, mut listeners) = ConnectionSpec::new();
let conn_file = context.temp_dir().path().join("connection.json"); let conn_file = context.temp_dir().path().join("connection.json");
conn_file.write_json(&conn); conn_file.write_json(&conn);
@ -405,22 +440,38 @@ async fn setup_server() -> (TestContext, ConnectionSpec, JupyterServerProcess) {
.unwrap() .unwrap()
}; };
// drop the listeners so the server can listen on the ports
drop(listeners);
// try to start the server, retrying up to 5 times // try to start the server, retrying up to 5 times
// (this can happen due to TOCTOU errors with selecting unused TCP ports) // (this can happen due to TOCTOU errors with selecting unused TCP ports)
let mut process = start_process(&conn_file); let mut process = start_process(&conn_file);
tokio::time::sleep(Duration::from_millis(1000)).await;
for _ in 0..5 { 'outer: for i in 0..10 {
// try to see if the server is healthy
for _ in 0..10 {
// server still running?
if process.try_wait().unwrap().is_none() { if process.try_wait().unwrap().is_none() {
break; // listening on all ports?
if server_ready(&conn).await {
// server is ready to go
break 'outer;
}
} else { } else {
conn = ConnectionSpec::default(); // server exited, try again
break;
}
tokio::time::sleep(Duration::from_millis(500)).await;
}
// pick new ports and try again
(conn, listeners) = ConnectionSpec::new();
conn_file.write_json(&conn); conn_file.write_json(&conn);
drop(listeners);
process = start_process(&conn_file); process = start_process(&conn_file);
tokio::time::sleep(Duration::from_millis(1000)).await; tokio::time::sleep(Duration::from_millis((i + 1) * 250)).await;
} }
} if process.try_wait().unwrap().is_some() || !server_ready(&conn).await {
if process.try_wait().unwrap().is_some() {
panic!("Failed to start Jupyter server"); panic!("Failed to start Jupyter server");
} }
(context, conn, JupyterServerProcess(Some(process))) (context, conn, JupyterServerProcess(Some(process)))
@ -430,6 +481,9 @@ async fn setup() -> (TestContext, JupyterClient, JupyterServerProcess) {
let (context, conn, process) = setup_server().await; let (context, conn, process) = setup_server().await;
let client = JupyterClient::new(&conn).await; let client = JupyterClient::new(&conn).await;
client.io_subscribe("").await.unwrap(); client.io_subscribe("").await.unwrap();
// make sure server is ready to receive messages
client.send_heartbeat(b"ping").await.unwrap();
let _ = client.recv_heartbeat().await.unwrap();
(context, client, process) (context, client, process)
} }
@ -530,7 +584,7 @@ async fn jupyter_execute_request() -> Result<()> {
Err(e) => { Err(e) => {
if e.downcast_ref::<tokio::time::error::Elapsed>().is_some() { if e.downcast_ref::<tokio::time::error::Elapsed>().is_some() {
// may timeout if we missed some messages // may timeout if we missed some messages
break; eprintln!("Timed out waiting for messages");
} }
panic!("Error: {:#?}", e); panic!("Error: {:#?}", e);
} }