Posted | Started | Updated | Runtime | Duration | In Waiting | Machine | Teuthology Branch | OS Type | OS Version | Nodes | Status |
---|---|---|---|---|---|---|---|---|---|---|---|
2024-09-01 00:02:02 | 2024-09-01 00:41:06 | 2024-09-01 01:03:07 | 0:22:01 | 0:15:14 | 0:06:47 | smithi | main | centos | 9.stream | 3 | fail |
Description: rados/cephadm/workunits/{0-distro/centos_9.stream_runc agent/off mon_election/classic task/test_monitoring_stack_basic}
Sentry event: https://sentry.ceph.com/organizations/ceph/?query=e59b73d664e5407ca040f65fcad30ddf
Command failed on smithi110 with status 5: 'sudo /home/ubuntu/cephtest/cephadm --image quay-quay-quay.apps.os.sepia.ceph.com/ceph-ci/ceph:5028887063e8eb518c95117479aa913eaa12ac8e shell -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring --fsid 5f70a1f2-67fc-11ef-bcd4-c7b262605968 -- bash -c \'set -e\nset -x\nceph orch apply node-exporter\nceph orch apply grafana\nceph orch apply alertmanager\nceph orch apply prometheus\nsleep 240\nceph orch ls\nceph orch ps\nceph orch host ls\nMON_DAEMON=$(ceph orch ps --daemon-type mon -f json | jq -r \'"\'"\'last | .daemon_name\'"\'"\')\nGRAFANA_HOST=$(ceph orch ps --daemon-type grafana -f json | jq -e \'"\'"\'.[]\'"\'"\' | jq -r \'"\'"\'.hostname\'"\'"\')\nPROM_HOST=$(ceph orch ps --daemon-type prometheus -f json | jq -e \'"\'"\'.[]\'"\'"\' | jq -r \'"\'"\'.hostname\'"\'"\')\nALERTM_HOST=$(ceph orch ps --daemon-type alertmanager -f json | jq -e \'"\'"\'.[]\'"\'"\' | jq -r \'"\'"\'.hostname\'"\'"\')\nGRAFANA_IP=$(ceph orch host ls -f json | jq -r --arg GRAFANA_HOST "$GRAFANA_HOST" \'"\'"\'.[] | select(.hostname==$GRAFANA_HOST) | .addr\'"\'"\')\nPROM_IP=$(ceph orch host ls -f json | jq -r --arg PROM_HOST "$PROM_HOST" \'"\'"\'.[] | select(.hostname==$PROM_HOST) | .addr\'"\'"\')\nALERTM_IP=$(ceph orch host ls -f json | jq -r --arg ALERTM_HOST "$ALERTM_HOST" \'"\'"\'.[] | select(.hostname==$ALERTM_HOST) | .addr\'"\'"\')\n# check each host node-exporter metrics endpoint is responsive\nALL_HOST_IPS=$(ceph orch host ls -f json | jq -r \'"\'"\'.[] | .addr\'"\'"\')\nfor ip in $ALL_HOST_IPS; do\n curl -s http://${ip}:9100/metric\ndone\n# check grafana endpoints are responsive and database health is okay\ncurl -k -s https://${GRAFANA_IP}:3000/api/health\ncurl -k -s https://${GRAFANA_IP}:3000/api/health | jq -e \'"\'"\'.database == "ok"\'"\'"\'\n# stop mon daemon in order to trigger an alert\nceph orch daemon stop $MON_DAEMON\nsleep 120\n# check prometheus endpoints are responsive and mon down alert is firing\ncurl -s 
http://${PROM_IP}:9095/api/v1/status/config\ncurl -s http://${PROM_IP}:9095/api/v1/status/config | jq -e \'"\'"\'.status == "success"\'"\'"\'\ncurl -s http://${PROM_IP}:9095/api/v1/alerts\ncurl -s http://${PROM_IP}:9095/api/v1/alerts | jq -e \'"\'"\'.data | .alerts | .[] | select(.labels | .alertname == "CephMonDown") | .state == "firing"\'"\'"\'\n# check alertmanager endpoints are responsive and mon down alert is active\ncurl -s http://${ALERTM_IP}:9093/api/v1/status\ncurl -s http://${ALERTM_IP}:9093/api/v1/alerts\ncurl -s http://${ALERTM_IP}:9093/api/v1/alerts | jq -e \'"\'"\'.data | .[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"\'"\'"\'\n\''