Files
ProjectNautilus/ground/src/telemetry.rs

195 lines
7.3 KiB
Rust

use api::client::Client;
use api::client::telemetry::{TelemetryHandle, TelemetryRegistry};
use futures::future::{Either, join_all, select};
use futures::{TryFutureExt, pin_mut};
use log::{error, trace};
use nautilus_common::on_drop::on_drop;
use nautilus_common::telemetry::{CommsState, SwitchBank, Telemetry, TelemetryMessage};
use nautilus_common::udp::UdpRecvPostcardError;
use nautilus_common::udp::tokio::AsyncUdpSocketExt;
use std::net::SocketAddr;
use std::sync::Arc;
use tokio::net::UdpSocket;
use tokio::sync::{RwLock, RwLockWriteGuard};
use tokio::task::yield_now;
use tokio::{join, select, try_join};
use tokio_util::sync::CancellationToken;
/// Ground-side telemetry pump: receives `Telemetry` packets over UDP,
/// publishes them through the API client's registry, and keeps the shared
/// flight-controller address current.
pub struct TelemetryHandler<'a> {
    /// Registry used to register and publish telemetry channels.
    tlm: TelemetryRegistry,
    /// Shared flight-controller address; updated to the sender address of
    /// each received packet.
    flight_addr: &'a RwLock<SocketAddr>,
    /// Write guard over `flight_addr` handed to us at construction.
    /// Consumed on the first received packet; afterwards fresh write locks
    /// are taken per packet (see `run`).
    lock: Option<RwLockWriteGuard<'a, SocketAddr>>,
    /// Socket telemetry packets arrive on.
    udp: &'a UdpSocket,
    /// Token used both to observe shutdown and to trigger it when this
    /// handler exits (see `run`).
    cancel: CancellationToken,
}
impl<'a> TelemetryHandler<'a> {
    /// Create a new handler.
    ///
    /// `lock` is an already-held write guard over `flight_addr`; it is kept
    /// until the first telemetry packet arrives, so readers of the address
    /// stay blocked until a real flight address is known.
    /// NOTE(review): that intent is inferred from how `run` consumes the
    /// guard — confirm against the caller.
    pub fn new(
        client: Arc<Client>,
        flight_addr: &'a RwLock<SocketAddr>,
        lock: RwLockWriteGuard<'a, SocketAddr>,
        udp: &'a UdpSocket,
        cancel: CancellationToken,
    ) -> Self {
        Self {
            tlm: TelemetryRegistry::new(client),
            flight_addr,
            lock: Some(lock),
            udp,
            cancel,
        }
    }

    /// Run this telemetry handler.
    /// Note: this method is expected to block so should run in its own thread
    ///
    /// Returns `Ok(())` on cancellation; errors only if the tokio runtime
    /// fails to build.
    pub fn run(mut self) -> anyhow::Result<()> {
        let cancel = self.cancel.clone();
        // Trigger a shutdown if we exit the telemetry process for some reason (including panic)
        let _shutdown_when_closed = on_drop(move || cancel.cancel());
        // Single-threaded runtime: the whole loop runs on the calling thread.
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()?;
        runtime.block_on(async {
            // Allow cancellation while waiting for this to initialize
            let mut context = {
                let context = TelemetryContext::new(&self);
                pin_mut!(context);
                let cancellation_future = self.cancel.cancelled();
                pin_mut!(cancellation_future);
                // Race channel registration against shutdown; exit cleanly
                // if cancellation wins.
                let init_or_cancel = select(context, cancellation_future).await;
                match init_or_cancel {
                    Either::Left((success, _)) => success,
                    Either::Right(_) => return Ok(()),
                }
            };
            let mut buffer = [0u8; 512];
            while !self.cancel.is_cancelled() {
                select! {
                    // Wake up on shutdown; the loop condition re-checks the token.
                    () = self.cancel.cancelled() => {},
                    incoming = self.udp.recv_postcard::<Telemetry>(&mut buffer) => {
                        match incoming {
                            Ok((tlm, addr, _)) => {
                                trace!("{tlm:?}");
                                let flight_addr_update = async {
                                    // The first time around we will use the lock given to us
                                    // Other times we will grab a new lock
                                    let mut lock = match self.lock.take() {
                                        None => self.flight_addr.write().await,
                                        Some(lock) => lock,
                                    };
                                    // Update the value in the lock
                                    *lock = addr;
                                };
                                tokio::pin!(flight_addr_update);
                                // We can do these two operations concurrently
                                join!(
                                    flight_addr_update,
                                    context.step(tlm),
                                );
                            },
                            Err(UdpRecvPostcardError::NoData) => {
                                // This shouldn't be possible when using a tokio socket I don't think
                                // But let's just yield our time anyways
                                yield_now().await;
                            },
                            Err(err) => {
                                // Malformed packet or socket error: log and keep receiving.
                                error!("Rx error: {err}");
                            },
                        }
                    }
                }
            }
            // Explicit Drop
            drop(self);
            Ok(())
        })
    }
}
/// Registered handles for every telemetry channel the ground station publishes.
struct TelemetryContext {
    // One handle per switch in each 16-switch bank
    // (registered as "switch.bank_a.N" / "switch.bank_b.N").
    bank_a: Vec<TelemetryHandle<bool>>,
    bank_b: Vec<TelemetryHandle<bool>>,
    // Comms-link counters (registered under "comms.*").
    tx_packets: TelemetryHandle<u32>,
    rx_packets: TelemetryHandle<u32>,
    tx_bytes: TelemetryHandle<u32>,
    rx_bytes: TelemetryHandle<u32>,
    tx_errors: TelemetryHandle<u32>,
    rx_errors: TelemetryHandle<u32>,
}
impl TelemetryContext {
async fn new(tlm: &TelemetryHandler<'_>) -> Self {
let bank_a =
join_all((0..16).map(|i| tlm.tlm.register(format!("switch.bank_a.{i}")))).await;
let bank_b =
join_all((0..16).map(|i| tlm.tlm.register(format!("switch.bank_b.{i}")))).await;
let tx_packets = tlm.tlm.register("comms.tx_packets").await;
let rx_packets = tlm.tlm.register("comms.rx_packets").await;
let tx_bytes = tlm.tlm.register("comms.tx_bytes").await;
let rx_bytes = tlm.tlm.register("comms.rx_bytes").await;
let tx_errors = tlm.tlm.register("comms.tx_errors").await;
let rx_errors = tlm.tlm.register("comms.rx_errors").await;
Self {
bank_a,
bank_b,
tx_packets,
rx_packets,
tx_bytes,
rx_bytes,
tx_errors,
rx_errors,
}
}
async fn step(&mut self, tlm: Telemetry) {
match tlm.message {
TelemetryMessage::SwitchState { bank, switches } => {
let bank_handles = match bank {
SwitchBank::A => &self.bank_a,
SwitchBank::B => &self.bank_b,
};
assert!(bank_handles.len() >= switches.iter().len());
join_all(switches.into_iter().enumerate().map(|(i, state)| {
bank_handles[i]
.publish(state, tlm.timestamp)
.unwrap_or_else(move |err| {
// We don't need to bubble this error up, just report it
error!("Failed to publish telemetry for switch {bank} {i}: {err}");
})
}))
.await;
}
TelemetryMessage::CommsState(CommsState {
tx_packets,
rx_packets,
tx_bytes,
rx_bytes,
tx_errors,
rx_errors,
}) => {
if let Err(e) = try_join!(
self.tx_packets.publish(tx_packets, tlm.timestamp),
self.rx_packets.publish(rx_packets, tlm.timestamp),
self.tx_bytes.publish(tx_bytes, tlm.timestamp),
self.rx_bytes.publish(rx_bytes, tlm.timestamp),
self.tx_errors.publish(tx_errors, tlm.timestamp),
self.rx_errors.publish(rx_errors, tlm.timestamp),
) {
error!("Failed to publish comms telemetry: {e}");
}
}
}
}
}