Code analysis: how a Nova virtual machine saves a custom image

Recently, a Nova virtual machine at our company failed to save a custom image, so let's walk through the whole process from the code side.
  1. Saving a custom image of a virtual machine
#nova image-create {{INSTANCE_UUID}} {{NEW_IMAGE_NAME}}
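
Behind this CLI command is the createImage server action of the compute API. As a rough illustration (a minimal sketch with a placeholder endpoint, token, and IDs, not values from the deployment), the equivalent REST call looks like this:

import requests

NOVA_ENDPOINT = 'http://controller:8774/v2.1'   # placeholder
TOKEN = '<keystone token>'                       # placeholder
INSTANCE_UUID = '<instance uuid>'                # placeholder

# POST the createImage action against the server
resp = requests.post(
    NOVA_ENDPOINT + '/servers/' + INSTANCE_UUID + '/action',
    headers={'X-Auth-Token': TOKEN,
             'X-OpenStack-Nova-API-Version': '2.45'},
    json={'createImage': {'name': 'my-custom-image', 'metadata': {}}})

# With microversion >= 2.45 the new image id comes back in the body;
# on older microversions it is only exposed via the Location header.
image_id = resp.json()['image_id']
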
  2. nova/api/openstack/compute/servers.py

The request hits the API's _action_create_image, which looks up the instance and its block device mappings (bdms). Because enable_snapshot_volume_backed is not enabled in our deployment, self.compute_api.snapshot is executed rather than snapshot_volume_backed.

    @wsgi.response(202)
    @wsgi.expected_errors((400, 403, 404, 409))
    @wsgi.action('createImage')
    @validation.schema(schema_servers.create_image, '2.0', '2.0')
    @validation.schema(schema_servers.create_image, '2.1')
    def _action_create_image(self, req, id, body):
        """Snapshot a server instance."""
        context = req.environ['nova.context']
        context.can(server_policies.SERVERS % 'create_image')

        entity = body["createImage"]
        image_name = common.normalize_name(entity["name"])
        metadata = entity.get('metadata', {})

        # Starting from microversion 2.39 we don't check quotas on createImage
        if api_version_request.is_supported(
                req, max_version=
                api_version_request.MAX_IMAGE_META_PROXY_API_VERSION):
            common.check_img_metadata_properties_quota(context, metadata)

        instance = self._get_server(context, req, id)

        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)

        try:
            if CONF.api.enable_snapshot_volume_backed and \
                compute_utils.is_volume_backed_instance(context, instance,
                                                        bdms):
                context.can(server_policies.SERVERS %
                    'create_image:allow_volume_backed')
                image = self.compute_api.snapshot_volume_backed(
                                                       context,
                                                       instance,
                                                       image_name,
                                                       extra_properties=
                                                       metadata)
            else:
                image = self.compute_api.snapshot(context,
                                                  instance,
                                                  image_name,
                                                  extra_properties=metadata)
        except exception.InstanceUnknownCell as e:
            raise exc.HTTPNotFound(explanation=e.format_message())
        except exception.InstanceInvalidState as state_error:
            common.raise_http_conflict_for_instance_invalid_state(state_error,
                        'createImage', id)
        except exception.Invalid as err:
            raise exc.HTTPBadRequest(explanation=err.format_message())
        except exception.OverQuota as e:
            raise exc.HTTPForbidden(explanation=e.format_message())

        # Starting with microversion 2.45 we return a response body containing
        # the snapshot image id without the Location header.
        if api_version_request.is_supported(req, '2.45'):
            return {'image_id': image['id']}

        # build location of newly-created image entity
        image_id = str(image['id'])
        image_ref = image_api.API().generate_image_url(image_id, context)

        resp = webob.Response(status_int=202)
        resp.headers['Location'] = image_ref
        return resp
  3. nova/compute/api.py

compute_api.snapshot first calls compute_utils.create_image, which asks Glance to create the image record; it then sets the instance's task state, records a create_image action for the instance, and finally sends the snapshot_instance RPC request with the instance and the new image's uuid as parameters.

    def snapshot(self, context, instance, name, extra_properties=None):
        """Snapshot the given instance.

        :param instance: nova.objects.instance.Instance object
        :param name: name of the snapshot
        :param extra_properties: dict of extra image properties to include
                                 when creating the image.
        :returns: A dict containing image metadata
        """
        image_meta = compute_utils.create_image(
            context, instance, name, 'snapshot', self.image_api,
            extra_properties=extra_properties)

        # NOTE(comstud): Any changes to this method should also be made
        # to the snapshot_instance() method in nova/cells/messaging.py
        instance.task_state = task_states.IMAGE_SNAPSHOT_PENDING
        try:
            instance.save(expected_task_state=[None])
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError) as ex:
            # Changing the instance task state to use in raising the
            # InstanceInvalidState exception below
            LOG.debug('Instance disappeared during snapshot.',
                      instance=instance)
            try:
                image_id = image_meta['id']
                self.image_api.delete(context, image_id)
                LOG.info('Image %s deleted because instance '
                         'deleted before snapshot started.',
                         image_id, instance=instance)
            except exception.ImageNotFound:
                pass
            except Exception as exc:
                LOG.warning("Error while trying to clean up image %(img_id)s: "
                            "%(error_msg)s",
                            {"img_id": image_meta['id'],
                             "error_msg": six.text_type(exc)})
            attr = 'task_state'
            state = task_states.DELETING
            if type(ex) == exception.InstanceNotFound:
                attr = 'vm_state'
                state = vm_states.DELETED
            raise exception.InstanceInvalidState(attr=attr,
                                           instance_uuid=instance.uuid,
                                           state=state,
                                           method='snapshot')

        self._record_action_start(context, instance,
                                  instance_actions.CREATE_IMAGE)

        self.compute_rpcapi.snapshot_instance(context, instance,
                                              image_meta['id'])

        return image_meta
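
A note on compute_utils.create_image: it only registers a new (queued) image in Glance; no image data is uploaded at this point. Roughly (a simplified sketch, not the verbatim upstream code), it does something like this:

# Simplified sketch of compute_utils.create_image (not verbatim upstream code)
def create_image(context, instance, name, image_type, image_api,
                 extra_properties=None):
    properties = {'instance_uuid': instance.uuid,
                  'user_id': str(context.user_id),
                  'image_type': image_type}      # 'snapshot' in this path
    properties.update(extra_properties or {})
    image_meta = {'name': name,
                  'properties': properties}
    # Creates only the image record in Glance; the image data is uploaded
    # (or the rbd location registered) later by the compute node.
    return image_api.create(context, image_meta)
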
  4. nova/compute/manager.py

The compute manager sets the instance's task state (and refreshes its power state), then calls self.driver.snapshot to take the actual snapshot.

    def snapshot_instance(self, context, image_id, instance):
        """Snapshot an instance on this host.

        :param context: security context
        :param image_id: glance.db.sqlalchemy.models.Image.Id
        :param instance: a nova.objects.instance.Instance object
        """
        # NOTE(dave-mcnally) the task state will already be set by the api
        # but if the compute manager has crashed/been restarted prior to the
        # request getting here the task state may have been cleared so we set
        # it again and things continue normally
        try:
            instance.task_state = task_states.IMAGE_SNAPSHOT
            instance.save(
                        expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
        except exception.InstanceNotFound:
            # possibility instance no longer exists, no point in continuing
            LOG.debug("Instance not found, could not set state %s "
                      "for instance.",
                      task_states.IMAGE_SNAPSHOT, instance=instance)
            return

        except exception.UnexpectedDeletingTaskStateError:
            LOG.debug("Instance being deleted, snapshot cannot continue",
                      instance=instance)
            return

        self._snapshot_instance(context, image_id, instance,
                                task_states.IMAGE_SNAPSHOT)

    def _snapshot_instance(self, context, image_id, instance,
                           expected_task_state):
        context = context.elevated()

        instance.power_state = self._get_power_state(context, instance)
        try:
            instance.save()

            LOG.info('instance snapshotting', instance=instance)

            if instance.power_state != power_state.RUNNING:
                state = instance.power_state
                running = power_state.RUNNING
                LOG.warning('trying to snapshot a non-running instance: '
                            '(state: %(state)s expected: %(running)s)',
                            {'state': state, 'running': running},
                            instance=instance)

            self._notify_about_instance_usage(
                context, instance, "snapshot.start")
            compute_utils.notify_about_instance_snapshot(context, instance,
                self.host, phase=fields.NotificationPhase.START,
                snapshot_image_id=image_id)

            def update_task_state(task_state,
                                  expected_state=expected_task_state):
                instance.task_state = task_state
                instance.save(expected_task_state=expected_state)

            with timeutils.StopWatch() as timer:
                self.driver.snapshot(context, instance, image_id,
                                     update_task_state)
            LOG.info('Took %0.2f seconds to snapshot the instance on '
                     'the hypervisor.', timer.elapsed(), instance=instance)

            instance.task_state = None
            instance.save(expected_task_state=task_states.IMAGE_UPLOADING)

            self._notify_about_instance_usage(context, instance,
                                              "snapshot.end")
            compute_utils.notify_about_instance_snapshot(context, instance,
                self.host, phase=fields.NotificationPhase.END,
                snapshot_image_id=image_id)
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            # the instance got deleted during the snapshot
            # Quickly bail out of here
            msg = 'Instance disappeared during snapshot'
            LOG.debug(msg, instance=instance)
            try:
                image = self.image_api.get(context, image_id)
                if image['status'] != 'active':
                    self.image_api.delete(context, image_id)
            except exception.ImageNotFound:
                LOG.debug('Image not found during clean up %s', image_id)
            except Exception:
                LOG.warning("Error while trying to clean up image %s",
                            image_id, instance=instance)
        except exception.ImageNotFound:
            instance.task_state = None
            instance.save()
            LOG.warning("Image not found during snapshot", instance=instance)
  5. nova/virt/libvirt/driver.py

The libvirt driver's snapshot function is called. This block of code is relatively complex: it decides between a live and a cold snapshot, and for RBD-backed root disks it calls root_disk.direct_snapshot to snapshot the Ceph block device and obtain a backend URL, then calls the image API to update the image's location.

    def snapshot(self, context, instance, image_id, update_task_state):
        """Create snapshot from a running VM instance.

        This command only works with qemu 0.14 +
        """
        try:
            guest = self._host.get_guest(instance)

            # TODO(sahid): We are converting all calls from a
            # virDomain object to use nova.virt.libvirt.Guest.
            # We should be able to remove virt_dom at the end.
            virt_dom = guest._domain
        except exception.InstanceNotFound:
            raise exception.InstanceNotRunning(instance_id=instance.uuid)

        snapshot = self._image_api.get(context, image_id)

        # source_format is an on-disk format
        # source_type is a backend type
        disk_path, source_format = libvirt_utils.find_disk(guest)
        source_type = libvirt_utils.get_disk_type_from_path(disk_path)

        # We won't have source_type for raw or qcow2 disks, because we can't
        # determine that from the path. We should have it from the libvirt
        # xml, though.
        if source_type is None:
            source_type = source_format
        # For lxc instances we won't have it either from libvirt xml
        # (because we just gave libvirt the mounted filesystem), or the path,
        # so source_type is still going to be None. In this case,
        # root_disk is going to default to CONF.libvirt.images_type
        # below, which is still safe.

        image_format = CONF.libvirt.snapshot_image_format or source_type

        # NOTE(bfilippov): save lvm and rbd as raw
        if image_format == 'lvm' or image_format == 'rbd':
            image_format = 'raw'

        metadata = self._create_snapshot_metadata(instance.image_meta,
                                                  instance,
                                                  image_format,
                                                  snapshot['name'])

        snapshot_name = uuidutils.generate_uuid(dashed=False)

        state = guest.get_power_state(self._host)

        # NOTE(dgenin): Instances with LVM encrypted ephemeral storage require
        # cold snapshots. Currently, checking for encryption is
        # redundant because LVM supports only cold snapshots.
        # It is necessary in case this situation changes in the
        # future.
        if (self._host.has_min_version(hv_type=host.HV_DRIVER_QEMU)
                and source_type not in ('lvm', 'qcow2')
                and not CONF.ephemeral_storage_encryption.enabled
                and not CONF.workarounds.disable_libvirt_livesnapshot
                # NOTE(rmk): We cannot perform live snapshots when a
                # managedSave file is present, so we will use the cold/legacy
                # method for instances which are shutdown or paused.
                # NOTE(mriedem): Live snapshot doesn't work with paused
                # instances on older versions of libvirt/qemu. We can likely
                # remove the restriction on PAUSED once we require
                # libvirt>=3.6.0 and qemu>=2.10 since that works with the
                # Pike Ubuntu Cloud Archive testing in Queens.
                and state not in (power_state.SHUTDOWN, power_state.PAUSED)):
            live_snapshot = True
            # Abort is an idempotent operation, so make sure any block
            # jobs which may have failed are ended. This operation also
            # confirms the running instance, as opposed to the system as a
            # whole, has a new enough version of the hypervisor (bug 1193146).
            try:
                guest.get_block_device(disk_path).abort_job()
            except libvirt.libvirtError as ex:
                error_code = ex.get_error_code()
                if error_code == libvirt.VIR_ERR_CONFIG_UNSUPPORTED:
                    live_snapshot = False
                else:
                    pass
        else:
            live_snapshot = False

        self._prepare_domain_for_snapshot(context, live_snapshot, state,
                                          instance)

        ceph_conf = dict()
        if source_type == "rbd":
            bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                context, instance.uuid)
            for bdm in bdms:
                if bdm.get('boot_index', -1) == 0 and bdm.destination_type == 'volume':
                    connection_info = jsonutils.loads(bdm.connection_info)
                    ceph_conf["hosts"] = connection_info["data"]["hosts"]
                    ceph_conf["auth_username"] = connection_info["data"]["auth_username"]
                    LOG.info("Get ceph conf {ceph_conf}".format(ceph_conf=ceph_conf))
                    instance.ceph_conf = ceph_conf
                    volume = self._volume_api.get(context, bdm['volume_id'])
                    if 'volume_image_metadata' in volume and \
                            volume['volume_image_metadata'].get("stores", None):
                        metadata["backend"] = volume['volume_image_metadata']['stores']
                    break

                if bdm.get('boot_index', -1) == 0 and bdm.destination_type == 'local':
                    if CONF.libvirt.glance_backend_config:
                        metadata["backend"] = CONF.libvirt.glance_backend_config

        root_disk = self.image_backend.by_libvirt_path(
            instance, disk_path, image_type=source_type)

        LOG.info("Display image information, such as: instance is {instance}, disk_path is {disk_path}, source_format is {source_format}, source_type is {source_type}, image_format is {image_format}".format(
            instance=instance, disk_path=disk_path, source_format=source_format, source_type=source_type, image_format=image_format))

        if live_snapshot:
            LOG.info("Beginning live snapshot process", instance=instance)
        else:
            LOG.info("Beginning cold snapshot process", instance=instance)

        update_task_state(task_state=task_states.IMAGE_PENDING_UPLOAD)

        update_task_state(task_state=task_states.IMAGE_UPLOADING,
                          expected_state=task_states.IMAGE_PENDING_UPLOAD)

        try:
            metadata['location'] = root_disk.direct_snapshot(
                context, snapshot_name, image_format, image_id,
                instance.image_ref)
            if CONF.enable_verify_image_md5sum:
                metadata['properties']['md5sum'] = \
                        root_disk.calc_image_md5sum(metadata['location'])
            self._snapshot_domain(context, live_snapshot, virt_dom, state,
                                  instance)
            self._image_api.update(context, image_id, metadata,
                                   purge_props=False)
        except (NotImplementedError, exception.ImageUnacceptable,
                exception.Forbidden) as e:
            if type(e) != NotImplementedError:
                LOG.warning('Performing standard snapshot because direct '
                            'snapshot failed: %(error)s',
                            {'error': encodeutils.exception_to_unicode(e)})
            failed_snap = metadata.pop('location', None)
            if failed_snap:
                failed_snap = {'url': str(failed_snap)}
            root_disk.cleanup_direct_snapshot(failed_snap,
                                              also_destroy_volume=True,
                                              ignore_errors=True)
            update_task_state(task_state=task_states.IMAGE_PENDING_UPLOAD,
                              expected_state=task_states.IMAGE_UPLOADING)

            # TODO(nic): possibly abstract this out to the root_disk
            if source_type == 'rbd' and live_snapshot:
                # Standard snapshot uses qemu-img convert from RBD which is
                # not safe to run with live_snapshot.
                live_snapshot = False
                # Suspend the guest, so this is no longer a live snapshot
                self._prepare_domain_for_snapshot(context, live_snapshot,
                                                  state, instance)

            snapshot_directory = CONF.libvirt.snapshots_directory
            fileutils.ensure_tree(snapshot_directory)
            with utils.tempdir(dir=snapshot_directory) as tmpdir:
                try:
                    out_path = os.path.join(tmpdir, snapshot_name)
                    if live_snapshot:
                        # NOTE(xqueralt): libvirt needs o + x in the tempdir
                        os.chmod(tmpdir, 0o701)
                        self._live_snapshot(context, instance, guest,
                                            disk_path, out_path, source_format,
                                            image_format, instance.image_meta)
                        if source_format == 'qcow2' and \
                                CONF.libvirt.convert_ceph_raw_for_qcow2:
                            snapshot_path = out_path + '.delta'
                            # use qemu-img convert to convert qcow2 img to ceph raw
                            metadata['location'] = root_disk.convert_to_ceph(
                                                      context, snapshot_path,
                                                      image_format, image_id)
                            if CONF.enable_verify_image_md5sum:
                                metadata['properties']['md5sum'] = \
                                        root_disk.calc_image_md5sum(
                                                metadata['location'])
                    else:
                        if source_format == 'qcow2' and \
                                CONF.libvirt.convert_ceph_raw_for_qcow2:
                            metadata['location'] = root_disk.convert_to_ceph(
                                                      context, None,
                                                      image_format, image_id)
                            if CONF.enable_verify_image_md5sum:
                                metadata['properties']['md5sum'] = \
                                        root_disk.calc_image_md5sum(
                                                metadata['location'])
                        else:
                            root_disk.snapshot_extract(out_path, image_format)
                    if not CONF.libvirt.convert_ceph_raw_for_qcow2:
                        LOG.info("Snapshot extracted, beginning image upload",
                             instance=instance)
                except libvirt.libvirtError as ex:
                    error_code = ex.get_error_code()
                    if error_code == libvirt.VIR_ERR_NO_DOMAIN:
                        LOG.info('Instance %(instance_name)s disappeared '
                                 'while taking snapshot of it: [Error Code '
                                 '%(error_code)s] %(ex)s',
                                 {'instance_name': instance.name,
                                  'error_code': error_code,
                                  'ex': ex},
                                 instance=instance)
                        raise exception.InstanceNotFound(
                            instance_id=instance.uuid)
                    else:
                        raise
                finally:
                    self._snapshot_domain(context, live_snapshot, virt_dom,
                                          state, instance)

                update_task_state(task_state=task_states.IMAGE_UPLOADING,
                        expected_state=task_states.IMAGE_PENDING_UPLOAD)
                if source_format == 'qcow2' and \
                        CONF.libvirt.convert_ceph_raw_for_qcow2:
                    self._image_api.update(context, image_id, metadata,
                                           purge_props=False)
                else:
                    # Upload that image to the image service
                    LOG.debug('upload to glance, out_path:%s', out_path)
                    with libvirt_utils.file_open(out_path, 'rb') as image_file:
                        # execute operation with disk concurrency semaphore
                        with compute_utils.disk_ops_semaphore:
                            self._image_api.update(context,
                                                   image_id,
                                                   metadata,
                                                   image_file)
        except Exception:
            with excutils.save_and_reraise_exception():
                LOG.exception(_("Failed to snapshot image"))
                failed_snap = metadata.pop('location', None)
                if failed_snap:
                    failed_snap = {'url': str(failed_snap)}
                root_disk.cleanup_direct_snapshot(
                        failed_snap, also_destroy_volume=True,
                        ignore_errors=True)

        LOG.info("Snapshot image upload complete", instance=instance)
    def direct_snapshot(self, context, snapshot_name, image_format,
                        image_id, base_image_id):
        """Creates an RBD snapshot directly.
        """
        fsid = self.driver.get_fsid()
        # NOTE(nic): Nova has zero comprehension of how Glance's image store
        # is configured, but we can infer what storage pool Glance is using
        # by looking at the parent image. If using authx, write access should
        # be enabled on that pool for the Nova user

        #parent_pool = self._get_parent_pool(context, base_image_id, fsid)
        # use instance pool to save image
        parent_pool = self.pool
        LOG.debug('self.path:%s, self.pool:%s', self.path, self.pool)
        # Snapshot the disk and clone it into Glance's storage pool. librbd
        # requires that snapshots be set to "protected" in order to clone them
        self.driver.create_snap(self.rbd_name, snapshot_name, protect=True)
        location = {'url': 'rbd://%(fsid)s/%(pool)s/%(image)s/%(snap)s' %
                           dict(fsid=fsid,
                                pool=self.pool,
                                image=self.rbd_name,
                                snap=snapshot_name)}
        try:
            self.driver.clone(location, image_id, dest_pool=parent_pool)
            # Flatten the image, which detaches it from the source snapshot
            self.driver.flatten(image_id, pool=parent_pool)
        finally:
            # all done with the source snapshot, clean it up
            self.cleanup_direct_snapshot(location)

        # Glance makes a protected snapshot called 'snap' on uploaded
        # images and hands it out, so we'll do that too. The name of
        # the snapshot doesn't really matter, this just uses what the
        # glance-store rbd backend sets (which is not configurable).
        self.driver.create_snap(image_id, 'snap', pool=parent_pool,
                                protect=True)
        return ('rbd://%(fsid)s/%(pool)s/%(image)s/snap' %
                dict(fsid=fsid, pool=parent_pool, image=image_id))
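
For illustration, the location handed back to the driver and stored on the Glance image has the form rbd://<cluster fsid>/<pool>/<glance image id>/snap (placeholders, not real values). Note that with the downstream change above (parent_pool = self.pool), the clone is created in the instance's own pool instead of the pool inferred from the base image, so the returned URL points into that pool as well.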