Pulpito :: Results Dashboard

Posted	Started	Updated	Runtime	Duration	In Waiting	Machine	Teuthology Branch	OS Type	OS Version	Nodes	Status
2023-05-28 15:21:07	2023-05-29 00:23:44	2023-05-29 01:17:16	0:53:32	0:43:48	0:09:44	smithi	main	centos	8.stream	2	pass

Nodes: smithi032 smithi164

Description: orch:cephadm/upgrade/{1-start-distro/1-start-centos_8.stream_container-tools 2-repo_digest/defaut 3-upgrade/staggered 4-wait 5-upgrade-ls agent/on mon_election/classic}

Log: http://qa-proxy.ceph.com/teuthology/adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi/7289325/teuthology.log

All Details...

log_href: http://qa-proxy.ceph.com/teuthology/adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi/7289325/teuthology.log
archive_path: /home/teuthworker/archive/adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi/7289325
description: orch:cephadm/upgrade/{1-start-distro/1-start-centos_8.stream_container-tools 2-repo_digest/defaut 3-upgrade/staggered 4-wait 5-upgrade-ls agent/on mon_election/classic}
duration: 0:43:48
email: adking@redhat.com
failure_reason:
flavor:
job_id: 7289325
kernel:
- kdb: True
- sha1: distro
last_in_suite: False
machine_type: smithi
name: adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi
nuke_on_error: True
os_type: centos
os_version: 8.stream
overrides:
- admin_socket:
  - branch: wip-adk-testing-2023-05-27-2231
- ceph:
  - conf:
    - global:
      - mon election default strategy: 1
    - mgr:
      - debug mgr: 20
      - debug ms: 1
      - mgr/cephadm/use_agent: True
    - mon:
      - debug mon: 20
      - debug ms: 1
      - debug paxos: 20
    - osd:
      - debug ms: 1
      - debug osd: 20
  - flavor: default
  - log-ignorelist:
    - $MDS_ALL_DOWN$
    - $MDS_UP_LESS_THAN_MAX$
  - sha1: bcb4c52075444555d801af0eb790e902aa9b890f
- ceph-deploy:
  - conf:
    - client:
      - log file: /var/log/ceph/ceph-$name.$pid.log
    - mon:
      - osd default pool size: 2
- install:
  - ceph:
    - flavor: default
    - sha1: bcb4c52075444555d801af0eb790e902aa9b890f
- workunit:
  - branch: wip-adk-testing-2023-05-27-2231
  - sha1: bcb4c52075444555d801af0eb790e902aa9b890f
owner: scheduled_adking@teuthology
pid:
roles:
- ['mon.a', 'mon.c', 'mgr.y', 'osd.0', 'osd.1', 'osd.2', 'osd.3', 'client.0', 'node-exporter.a', 'alertmanager.a']
- ['mon.b', 'mgr.x', 'osd.4', 'osd.5', 'osd.6', 'osd.7', 'client.1', 'prometheus.a', 'grafana.a', 'node-exporter.b']
sentry_event:
status: pass
success: True
branch: wip-adk-testing-2023-05-27-2231
seed:
sha1: bcb4c52075444555d801af0eb790e902aa9b890f
subset:
suite:
suite_branch: wip-adk-testing-2023-05-27-2231
suite_path:
suite_relpath:
suite_repo:
suite_sha1: bcb4c52075444555d801af0eb790e902aa9b890f
targets:
- smithi032.front.sepia.ceph.com: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC4fn5nFVIanoXdrYE1pYSzpgnyj9/UAkGvWZQcES657/dOoRVcoBbQL75sBelJCCyh5OT0T1SSashhR6NQ4OcWUnyFeI9QCzy78cARnIUPDLa+MbkBfBBHds3UQkoe+hxWpJx7EGfUwq4aD449qjN8Ltxac3DK4InSe8Ekw9kUqr8rEJFtAc5sg+HZ3NxTrWZnDhPA3rQuJ7RlGrR0rihoWgjsBorNaL8vIewSU+/uOZAIY+91OPu2YM/hmHbQAKUwDNP15jW5U2lZfpsrg+QNYLBaoQ12o18pI+tSyy9XyCv3kqcN4sWihEzI5Yx0stOn5/XJyEfn6wd6qqhwa4FvTf4BbMaI4vkIKNU4VnY9xvuKzYTSKIEsTSKJn3XRY8Q7tebO/3Ts/rFvRAyu3nsEY6GWTxnH1rTDXrJmaimxch+tnW9VtyfpOMu1eWNvBdtln3rQ9InRYk68SI8HMXe66OCanlgv2DAhqHfpFsJy8Zqxn53+HJEFWzvPfSeZJgs=
- smithi164.front.sepia.ceph.com: ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFms6V/uigRQSanw/3CWhCfxzNIqzL9v4im2PXsfY+TC8WM+raw3ohGO1iuVUwEEhBGn4Hu70iVhDYdww1fujUE=
tasks:
- internal.check_packages:
- internal.buildpackages_prep:
- internal.save_config:
- internal.check_lock:
- internal.add_remotes:
- console_log:
- internal.connect:
- internal.push_inventory:
- internal.serialize_remote_roles:
- internal.check_conflict:
- internal.check_ceph_data:
- internal.vm_setup:
- kernel:
  - kdb: True
  - sha1: distro
- internal.base:
- internal.archive_upload:
- internal.archive:
- internal.coredump:
- internal.sudo:
- internal.syslog:
- internal.timer:
- pcp:
- selinux:
- ansible.cephlab:
- clock:
- pexec:
  - all:
    - sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
    - sudo dnf -y module reset container-tools
    - sudo dnf -y module install container-tools
    - sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf
- cephadm:
  - allow_ptrace: False
  - avoid_pacific_features: True
  - cephadm_branch: v16.2.0
  - cephadm_git_url: https://github.com/ceph/ceph
  - image: quay.io/ceph/ceph:v16.2.0
  - conf:
    - global:
      - mon election default strategy: 1
    - mgr:
      - debug mgr: 20
      - debug ms: 1
      - mgr/cephadm/use_agent: True
    - mon:
      - debug mon: 20
      - debug ms: 1
      - debug paxos: 20
    - osd:
      - debug ms: 1
      - debug osd: 20
  - flavor: default
  - log-ignorelist:
    - $MDS_ALL_DOWN$
    - $MDS_UP_LESS_THAN_MAX$
  - sha1: bcb4c52075444555d801af0eb790e902aa9b890f
  - cluster: ceph
  - cephadm_mode: root
- cephadm.shell:
  - mon.a:
    - set -ex # setup rgw radosgw-admin realm create --rgw-realm=r --default radosgw-admin zonegroup create --rgw-zonegroup=default --master --default radosgw-admin zone create --rgw-zonegroup=default --rgw-zone=z --master --default radosgw-admin period update --rgw-realm=r --commit ceph orch apply rgw foo --realm r --zone z --placement=2 --port=8000 # setup iscsi ceph osd pool create foo rbd pool init foo ceph orch apply iscsi foo u p sleep 180 ceph config set mon mon_warn_on_insecure_global_id_reclaim false --force ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false --force ceph config set global log_to_journald false --force # get some good info on the state of things pre-upgrade. Useful for debugging ceph orch ps ceph versions ceph -s ceph orch ls # collect the target id for the container we are upgrading to TARGET_ID="$(ceph orch upgrade check --image quay.ceph.io/ceph-ci/ceph:$sha1 | jq -r '.target_id')" echo "$TARGET_ID" # doing staggered upgrade requires mgr daemons being on a version that contains the staggered upgrade code # until there is a stable version that contains it, we can test by manually upgrading a mgr daemon ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1 ceph orch ps --refresh sleep 180 # gather more possible debugging info ceph orch ps ceph versions ceph -s ceph health detail # verify we have upgraded exactly 1 of the 2 mgr daemons to the new image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "1" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 1 mgr with new container image id"; fi # verify exactly 1 mgr is not upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id!=$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "1" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 1 mgr with old container image id"; fi ceph mgr fail sleep 180 # now try upgrading the other mgr ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1 ceph orch ps --refresh sleep 180 # gather more possible debugging info ceph orch ps ceph versions ceph health detail ceph -s ceph mgr fail sleep 180 # gather more debugging info ceph orch ps ceph versions ceph -s ceph health detail # now that both mgrs should have been redeployed with the new version, so should find 2 daemons # when matching against mgr daemons on the correct image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 2 mgr with new container image id"; fi ceph mgr fail sleep 180 # debugging info ceph orch ps ceph orch ls ceph versions # to make sure mgr daemons upgrade is fully completed, including being deployed by a mgr on new version # also serves as an early failure if manually upgrading the mgrs failed as --daemon-types won't be recognized ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mgr while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done # verify 2 mgr daemons both on the new container image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 2 mgr with new container image id"; fi # verify non-mgr daemons are still on old image id to make sure --daemon-types was respected ! ceph orch ps --format json | jq -e '.[] | select(.daemon_type!="mgr") | .container_image_id' | grep $TARGET_ID # check that exactly two daemons have been upgraded to the new image (our 2 mgr daemons) ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 2' ceph orch upgrade status ceph health detail # upgrade only the mons on one of the two hosts ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mon --hosts $(ceph orch ps | grep mgr.x | awk '{print $2}') while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify exactly 1 off the 2 mon daemons was upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mon") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "1" ]; then echo "Found unexpected number of upgraded mon daemons"; exit 1; else echo "Matched 1 mon with new container image id"; fi ceph orch upgrade status ceph health detail # upgrade mons on the other hosts ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mon --hosts $(ceph orch ps | grep mgr.y | awk '{print $2}') while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify all mons (3) now on same version and version hash matches what we are upgrading to matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mon") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "3" ]; then echo "Found unexpected number of upgraded mon daemons"; exit 1; else echo "Matched 3 mon with new container image id"; fi # verify exactly 5 daemons are now upgraded (2 mgrs, 3 mons) ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 5' ceph orch upgrade status ceph health detail # upgrade exactly 2 osd daemons ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types osd --limit 2 while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify exactly 2 of the 8 OSDs were upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="osd") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded osd daemons"; exit 1; else echo "Matched 2 osd with new container image id"; fi # verify exactly 7 daemons have been upgraded (2 mgrs, 3 mons, 2 osds) ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 7' ceph orch upgrade status ceph health detail # upgrade one more osd ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types crash,osd --limit 1 while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify 3 osd daemons have been upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="osd") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "3" ]; then echo "Found unexpected number of upgraded osd daemons"; exit 1; else echo "Matched 3 osd with new container image id"; fi # verify now 8 daemons have been upgraded ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 8' # upgrade the rest of the osds # use this opportunity to check we can set osd flags properly ceph orch upgrade status ceph health detail # make sure noout is listed as a flag to be set as that is what we'll test with ceph config get mgr mgr/cephadm/upgrade_osd_flags | grep noout # upgrade osds and crash daemons. ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types crash,osd # wait for upgrade to be started and in progress to check for osd flags # To test noout being set during upgrade, want to loop here until either the upgrade completes, # fails with an error, or noout is set, but in the noout case, we need to do something to mark # that that was the condition that we stopped looping on. Doing that here by having # it create a file whose existence we can check for once the loop is over while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do if ceph osd dump -f json | jq '.flags_set' | grep noout; then touch saw_noout.txt; break; else echo "no noout yet"; fi; sleep 1; done ls | grep saw_noout # wait for upgrade to complete while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify noout was unset once upgrade completed if ceph osd dump -f json | jq '.flags_set' | grep noout; then (exit 1); else (exit 0); fi # verify all 8 osds are on the new container image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="osd") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "8" ]; then echo "Found unexpected number of upgraded osd daemons"; exit 1; else echo "Matched 8 osd with new container image id"; fi ceph orch upgrade status ceph health detail # upgrade the rgw daemons using --services ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --services rgw.foo while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify all 2 rgw daemons were upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="rgw") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded rgw daemons"; exit 1; else echo "Matched 2 rgw with new container image id"; fi ceph orch upgrade status ceph health detail # run upgrade one more time with no filter parameters to make sure anything left gets upgraded ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
- cephadm.shell:
  - mon.a:
    - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; ceph health detail ; sleep 30 ; done
    - ceph orch ps
    - ceph versions
    - echo "wait for servicemap items w/ changing names to refresh"
    - sleep 60
    - ceph orch ps
    - ceph versions
    - ceph orch upgrade status
    - ceph health detail
    - ceph versions | jq -e '.overall | length == 1'
    - ceph versions | jq -e '.overall | keys' | grep $sha1
    - ceph orch ls | grep '^osd '
- cephadm.shell:
  - mon.a:
    - ceph orch upgrade ls
    - ceph orch upgrade ls --image quay.io/ceph/ceph --show-all-versions | grep 16.2.0
    - ceph orch upgrade ls --image quay.io/ceph/ceph --tags | grep v16.2.2
teuthology_branch: main
verbose: False
pcp_grafana_url:
priority:
user:
queue:
posted: 2023-05-28 15:21:07
started: 2023-05-29 00:23:44
updated: 2023-05-29 01:17:16
status_class: success
runtime: 0:53:32
wait_time: 0:09:44