Description: orch:cephadm/upgrade/{1-start-distro/1-start-centos_8.stream_container-tools 2-repo_digest/defaut 3-upgrade/staggered 4-wait 5-upgrade-ls agent/on mon_election/classic}

Log: http://qa-proxy.ceph.com/teuthology/adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi/7289325/teuthology.log

  • log_href: http://qa-proxy.ceph.com/teuthology/adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi/7289325/teuthology.log
  • archive_path: /home/teuthworker/archive/adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi/7289325
  • description: orch:cephadm/upgrade/{1-start-distro/1-start-centos_8.stream_container-tools 2-repo_digest/defaut 3-upgrade/staggered 4-wait 5-upgrade-ls agent/on mon_election/classic}
  • duration: 0:43:48
  • email: adking@redhat.com
  • failure_reason:
  • flavor:
  • job_id: 7289325
  • kernel:
    • kdb: True
    • sha1: distro
  • last_in_suite: False
  • machine_type: smithi
  • name: adking-2023-05-28_15:20:34-orch:cephadm-wip-adk-testing-2023-05-27-2231-distro-default-smithi
  • nuke_on_error: True
  • os_type: centos
  • os_version: 8.stream
  • overrides:
    • admin_socket:
      • branch: wip-adk-testing-2023-05-27-2231
    • ceph:
      • conf:
        • global:
          • mon election default strategy: 1
        • mgr:
          • debug mgr: 20
          • debug ms: 1
          • mgr/cephadm/use_agent: True
        • mon:
          • debug mon: 20
          • debug ms: 1
          • debug paxos: 20
        • osd:
          • debug ms: 1
          • debug osd: 20
      • flavor: default
      • log-ignorelist:
        • \(MDS_ALL_DOWN\)
        • \(MDS_UP_LESS_THAN_MAX\)
      • sha1: bcb4c52075444555d801af0eb790e902aa9b890f
    • ceph-deploy:
      • conf:
        • client:
          • log file: /var/log/ceph/ceph-$name.$pid.log
        • mon:
          • osd default pool size: 2
    • install:
      • ceph:
        • flavor: default
        • sha1: bcb4c52075444555d801af0eb790e902aa9b890f
    • workunit:
      • branch: wip-adk-testing-2023-05-27-2231
      • sha1: bcb4c52075444555d801af0eb790e902aa9b890f
  • owner: scheduled_adking@teuthology
  • pid:
  • roles:
    • ['mon.a', 'mon.c', 'mgr.y', 'osd.0', 'osd.1', 'osd.2', 'osd.3', 'client.0', 'node-exporter.a', 'alertmanager.a']
    • ['mon.b', 'mgr.x', 'osd.4', 'osd.5', 'osd.6', 'osd.7', 'client.1', 'prometheus.a', 'grafana.a', 'node-exporter.b']
  • sentry_event:
  • status: pass
  • success: True
  • branch: wip-adk-testing-2023-05-27-2231
  • seed:
  • sha1: bcb4c52075444555d801af0eb790e902aa9b890f
  • subset:
  • suite:
  • suite_branch: wip-adk-testing-2023-05-27-2231
  • suite_path:
  • suite_relpath:
  • suite_repo:
  • suite_sha1: bcb4c52075444555d801af0eb790e902aa9b890f
  • targets:
    • smithi032.front.sepia.ceph.com: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC4fn5nFVIanoXdrYE1pYSzpgnyj9/UAkGvWZQcES657/dOoRVcoBbQL75sBelJCCyh5OT0T1SSashhR6NQ4OcWUnyFeI9QCzy78cARnIUPDLa+MbkBfBBHds3UQkoe+hxWpJx7EGfUwq4aD449qjN8Ltxac3DK4InSe8Ekw9kUqr8rEJFtAc5sg+HZ3NxTrWZnDhPA3rQuJ7RlGrR0rihoWgjsBorNaL8vIewSU+/uOZAIY+91OPu2YM/hmHbQAKUwDNP15jW5U2lZfpsrg+QNYLBaoQ12o18pI+tSyy9XyCv3kqcN4sWihEzI5Yx0stOn5/XJyEfn6wd6qqhwa4FvTf4BbMaI4vkIKNU4VnY9xvuKzYTSKIEsTSKJn3XRY8Q7tebO/3Ts/rFvRAyu3nsEY6GWTxnH1rTDXrJmaimxch+tnW9VtyfpOMu1eWNvBdtln3rQ9InRYk68SI8HMXe66OCanlgv2DAhqHfpFsJy8Zqxn53+HJEFWzvPfSeZJgs=
    • smithi164.front.sepia.ceph.com: ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFms6V/uigRQSanw/3CWhCfxzNIqzL9v4im2PXsfY+TC8WM+raw3ohGO1iuVUwEEhBGn4Hu70iVhDYdww1fujUE=
  • tasks:
    • internal.check_packages:
    • internal.buildpackages_prep:
    • internal.save_config:
    • internal.check_lock:
    • internal.add_remotes:
    • console_log:
    • internal.connect:
    • internal.push_inventory:
    • internal.serialize_remote_roles:
    • internal.check_conflict:
    • internal.check_ceph_data:
    • internal.vm_setup:
    • kernel:
      • kdb: True
      • sha1: distro
    • internal.base:
    • internal.archive_upload:
    • internal.archive:
    • internal.coredump:
    • internal.sudo:
    • internal.syslog:
    • internal.timer:
    • pcp:
    • selinux:
    • ansible.cephlab:
    • clock:
    • pexec:
      • all:
        • sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
        • sudo dnf -y module reset container-tools
        • sudo dnf -y module install container-tools
        • sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf
    • cephadm:
      • allow_ptrace: False
      • avoid_pacific_features: True
      • cephadm_branch: v16.2.0
      • cephadm_git_url: https://github.com/ceph/ceph
      • image: quay.io/ceph/ceph:v16.2.0
      • conf:
        • global:
          • mon election default strategy: 1
        • mgr:
          • debug mgr: 20
          • debug ms: 1
          • mgr/cephadm/use_agent: True
        • mon:
          • debug mon: 20
          • debug ms: 1
          • debug paxos: 20
        • osd:
          • debug ms: 1
          • debug osd: 20
      • flavor: default
      • log-ignorelist:
        • \(MDS_ALL_DOWN\)
        • \(MDS_UP_LESS_THAN_MAX\)
      • sha1: bcb4c52075444555d801af0eb790e902aa9b890f
      • cluster: ceph
      • cephadm_mode: root
    • cephadm.shell:
      • mon.a:
        • set -ex # setup rgw radosgw-admin realm create --rgw-realm=r --default radosgw-admin zonegroup create --rgw-zonegroup=default --master --default radosgw-admin zone create --rgw-zonegroup=default --rgw-zone=z --master --default radosgw-admin period update --rgw-realm=r --commit ceph orch apply rgw foo --realm r --zone z --placement=2 --port=8000 # setup iscsi ceph osd pool create foo rbd pool init foo ceph orch apply iscsi foo u p sleep 180 ceph config set mon mon_warn_on_insecure_global_id_reclaim false --force ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false --force ceph config set global log_to_journald false --force # get some good info on the state of things pre-upgrade. Useful for debugging ceph orch ps ceph versions ceph -s ceph orch ls # collect the target id for the container we are upgrading to TARGET_ID="$(ceph orch upgrade check --image quay.ceph.io/ceph-ci/ceph:$sha1 | jq -r '.target_id')" echo "$TARGET_ID" # doing staggered upgrade requires mgr daemons being on a version that contains the staggered upgrade code # until there is a stable version that contains it, we can test by manually upgrading a mgr daemon ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1 ceph orch ps --refresh sleep 180 # gather more possible debugging info ceph orch ps ceph versions ceph -s ceph health detail # verify we have upgraded exactly 1 of the 2 mgr daemons to the new image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "1" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 1 mgr with new container image id"; fi # verify exactly 1 mgr is not upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id!=$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "1" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 1 mgr with old container image id"; fi ceph mgr fail sleep 180 # now try upgrading the other mgr ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1 ceph orch ps --refresh sleep 180 # gather more possible debugging info ceph orch ps ceph versions ceph health detail ceph -s ceph mgr fail sleep 180 # gather more debugging info ceph orch ps ceph versions ceph -s ceph health detail # now that both mgrs should have been redeployed with the new version, so should find 2 daemons # when matching against mgr daemons on the correct image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 2 mgr with new container image id"; fi ceph mgr fail sleep 180 # debugging info ceph orch ps ceph orch ls ceph versions # to make sure mgr daemons upgrade is fully completed, including being deployed by a mgr on new version # also serves as an early failure if manually upgrading the mgrs failed as --daemon-types won't be recognized ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mgr while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done # verify 2 mgr daemons both on the new container image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mgr") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded manager daemons"; exit 1; else echo "Matched 2 mgr with new container image id"; fi # verify non-mgr daemons are still on old image id to make sure --daemon-types was respected ! ceph orch ps --format json | jq -e '.[] | select(.daemon_type!="mgr") | .container_image_id' | grep $TARGET_ID # check that exactly two daemons have been upgraded to the new image (our 2 mgr daemons) ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 2' ceph orch upgrade status ceph health detail # upgrade only the mons on one of the two hosts ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mon --hosts $(ceph orch ps | grep mgr.x | awk '{print $2}') while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify exactly 1 off the 2 mon daemons was upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mon") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "1" ]; then echo "Found unexpected number of upgraded mon daemons"; exit 1; else echo "Matched 1 mon with new container image id"; fi ceph orch upgrade status ceph health detail # upgrade mons on the other hosts ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mon --hosts $(ceph orch ps | grep mgr.y | awk '{print $2}') while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify all mons (3) now on same version and version hash matches what we are upgrading to matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="mon") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "3" ]; then echo "Found unexpected number of upgraded mon daemons"; exit 1; else echo "Matched 3 mon with new container image id"; fi # verify exactly 5 daemons are now upgraded (2 mgrs, 3 mons) ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 5' ceph orch upgrade status ceph health detail # upgrade exactly 2 osd daemons ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types osd --limit 2 while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify exactly 2 of the 8 OSDs were upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="osd") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded osd daemons"; exit 1; else echo "Matched 2 osd with new container image id"; fi # verify exactly 7 daemons have been upgraded (2 mgrs, 3 mons, 2 osds) ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 7' ceph orch upgrade status ceph health detail # upgrade one more osd ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types crash,osd --limit 1 while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify 3 osd daemons have been upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="osd") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "3" ]; then echo "Found unexpected number of upgraded osd daemons"; exit 1; else echo "Matched 3 osd with new container image id"; fi # verify now 8 daemons have been upgraded ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 8' # upgrade the rest of the osds # use this opportunity to check we can set osd flags properly ceph orch upgrade status ceph health detail # make sure noout is listed as a flag to be set as that is what we'll test with ceph config get mgr mgr/cephadm/upgrade_osd_flags | grep noout # upgrade osds and crash daemons. ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types crash,osd # wait for upgrade to be started and in progress to check for osd flags # To test noout being set during upgrade, want to loop here until either the upgrade completes, # fails with an error, or noout is set, but in the noout case, we need to do something to mark # that that was the condition that we stopped looping on. Doing that here by having # it create a file whose existence we can check for once the loop is over while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do if ceph osd dump -f json | jq '.flags_set' | grep noout; then touch saw_noout.txt; break; else echo "no noout yet"; fi; sleep 1; done ls | grep saw_noout # wait for upgrade to complete while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify noout was unset once upgrade completed if ceph osd dump -f json | jq '.flags_set' | grep noout; then (exit 1); else (exit 0); fi # verify all 8 osds are on the new container image id matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="osd") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "8" ]; then echo "Found unexpected number of upgraded osd daemons"; exit 1; else echo "Matched 8 osd with new container image id"; fi ceph orch upgrade status ceph health detail # upgrade the rgw daemons using --services ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --services rgw.foo while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done ceph orch ps # verify all 2 rgw daemons were upgraded matching_daemon_count=$(ceph orch ps --format json | jq --arg TARGET_ID "$TARGET_ID" -e '.[] | select(.daemon_type=="rgw") | select(.container_image_id==$TARGET_ID)' | grep "container_image_id" | wc -l) if [ "$matching_daemon_count" != "2" ]; then echo "Found unexpected number of upgraded rgw daemons"; exit 1; else echo "Matched 2 rgw with new container image id"; fi ceph orch upgrade status ceph health detail # run upgrade one more time with no filter parameters to make sure anything left gets upgraded ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
    • cephadm.shell:
      • mon.a:
        • while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; ceph health detail ; sleep 30 ; done
        • ceph orch ps
        • ceph versions
        • echo "wait for servicemap items w/ changing names to refresh"
        • sleep 60
        • ceph orch ps
        • ceph versions
        • ceph orch upgrade status
        • ceph health detail
        • ceph versions | jq -e '.overall | length == 1'
        • ceph versions | jq -e '.overall | keys' | grep $sha1
        • ceph orch ls | grep '^osd '
    • cephadm.shell:
      • mon.a:
        • ceph orch upgrade ls
        • ceph orch upgrade ls --image quay.io/ceph/ceph --show-all-versions | grep 16.2.0
        • ceph orch upgrade ls --image quay.io/ceph/ceph --tags | grep v16.2.2
  • teuthology_branch: main
  • verbose: False
  • pcp_grafana_url:
  • priority:
  • user:
  • queue:
  • posted: 2023-05-28 15:21:07
  • started: 2023-05-29 00:23:44
  • updated: 2023-05-29 01:17:16
  • status_class: success
  • runtime: 0:53:32
  • wait_time: 0:09:44