Recently, one of our company's Nova virtual machines failed to save a custom image, so let's walk through the whole process from the code side.
- Saving a custom image of a virtual machine
```bash
# nova image-create {{INSTANCE_UUID}} {{NEW_IMAGE_NAME}}
```
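The same action can be driven from Python. Below is a minimal sketch using python-novaclient with a keystoneauth1 session; the auth URL, credentials, and the two placeholder strings are hypothetical and need to be filled in for your cloud:

```python
from keystoneauth1 import loading, session
from novaclient import client

# hypothetical credentials; substitute values for your deployment
loader = loading.get_plugin_loader('password')
auth = loader.load_from_options(auth_url='http://controller:5000/v3',
                                username='admin', password='secret',
                                project_name='admin',
                                user_domain_id='default',
                                project_domain_id='default')
nova = client.Client('2.1', session=session.Session(auth=auth))

# equivalent of `nova image-create`; returns the new image's UUID
image_id = nova.servers.create_image('INSTANCE_UUID', 'NEW_IMAGE_NAME')
print(image_id)
```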
- nova/api/openstack/compute/servers.py
The request is handled by the API's _action_create_image, which fetches the instance and its block device mappings (bdms). Because enable_snapshot_volume_backed is not turned on in our deployment, self.compute_api.snapshot is executed:
```python
@wsgi.response(202)
@wsgi.expected_errors((400, 403, 404, 409))
@wsgi.action('createImage')
@validation.schema(schema_servers.create_image, '2.0', '2.0')
@validation.schema(schema_servers.create_image, '2.1')
def _action_create_image(self, req, id, body):
    """Snapshot a server instance."""
    context = req.environ['nova.context']
    context.can(server_policies.SERVERS % 'create_image')

    entity = body["createImage"]
    image_name = common.normalize_name(entity["name"])
    metadata = entity.get('metadata', {})

    # Starting from microversion 2.39 we don't check quotas on createImage
    if api_version_request.is_supported(
            req, max_version=
            api_version_request.MAX_IMAGE_META_PROXY_API_VERSION):
        common.check_img_metadata_properties_quota(context, metadata)

    instance = self._get_server(context, req, id)

    bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
        context, instance.uuid)

    try:
        if CONF.api.enable_snapshot_volume_backed and \
                compute_utils.is_volume_backed_instance(context, instance,
                                                        bdms):
            context.can(server_policies.SERVERS %
                        'create_image:allow_volume_backed')
            image = self.compute_api.snapshot_volume_backed(
                context, instance, image_name,
                extra_properties=metadata)
        else:
            image = self.compute_api.snapshot(context, instance, image_name,
                                              extra_properties=metadata)
    except exception.InstanceUnknownCell as e:
        raise exc.HTTPNotFound(explanation=e.format_message())
    except exception.InstanceInvalidState as state_error:
        common.raise_http_conflict_for_instance_invalid_state(
            state_error, 'createImage', id)
    except exception.Invalid as err:
        raise exc.HTTPBadRequest(explanation=err.format_message())
    except exception.OverQuota as e:
        raise exc.HTTPForbidden(explanation=e.format_message())

    # Starting with microversion 2.45 we return a response body containing
    # the snapshot image id without the Location header.
    if api_version_request.is_supported(req, '2.45'):
        return {'image_id': image['id']}

    # build location of newly-created image entity
    image_id = str(image['id'])
    image_ref = image_api.API().generate_image_url(image_id, context)

    resp = webob.Response(status_int=202)
    resp.headers['Location'] = image_ref
    return resp
```
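For reference, this handler serves the createImage server action of the compute REST API. A hedged sketch of the raw HTTP call it accepts (the endpoint, token, and server UUID are placeholders), showing the microversion 2.45 behavior where the image id comes back in the response body:

```python
import requests

# placeholders for your deployment
NOVA = 'http://controller:8774/v2.1'
TOKEN = '...'
SERVER = 'INSTANCE_UUID'

resp = requests.post(
    '%s/servers/%s/action' % (NOVA, SERVER),
    headers={'X-Auth-Token': TOKEN,
             # >= 2.45: image id is returned in the body, no Location header
             'X-OpenStack-Nova-API-Version': '2.45'},
    json={'createImage': {'name': 'my-snapshot', 'metadata': {}}})
resp.raise_for_status()  # 202 on success
print(resp.json()['image_id'])
```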
- nova/compute/api.py
compute_api.snapshot calls compute_utils.create_image, which first calls Glance to create the image record. It then sets the instance's task state, creates an action record for the instance, and finally sends the snapshot_instance RPC request with the instance and the image uuid as parameters:
```python
def snapshot(self, context, instance, name, extra_properties=None):
    """Snapshot the given instance.

    :param instance: nova.objects.instance.Instance object
    :param name: name of the snapshot
    :param extra_properties: dict of extra image properties to include
                             when creating the image.
    :returns: A dict containing image metadata
    """
    image_meta = compute_utils.create_image(
        context, instance, name, 'snapshot', self.image_api,
        extra_properties=extra_properties)

    # NOTE(comstud): Any changes to this method should also be made
    # to the snapshot_instance() method in nova/cells/messaging.py
    instance.task_state = task_states.IMAGE_SNAPSHOT_PENDING
    try:
        instance.save(expected_task_state=[None])
    except (exception.InstanceNotFound,
            exception.UnexpectedDeletingTaskStateError) as ex:
        # Changing the instance task state to use in raising the
        # InstanceInvalidException below
        LOG.debug('Instance disappeared during snapshot.',
                  instance=instance)
        try:
            image_id = image_meta['id']
            self.image_api.delete(context, image_id)
            LOG.info('Image %s deleted because instance '
                     'deleted before snapshot started.',
                     image_id, instance=instance)
        except exception.ImageNotFound:
            pass
        except Exception as exc:
            LOG.warning("Error while trying to clean up image %(img_id)s: "
                        "%(error_msg)s",
                        {"img_id": image_meta['id'],
                         "error_msg": six.text_type(exc)})
        attr = 'task_state'
        state = task_states.DELETING
        if type(ex) == exception.InstanceNotFound:
            attr = 'vm_state'
            state = vm_states.DELETED
        raise exception.InstanceInvalidState(attr=attr,
                                             instance_uuid=instance.uuid,
                                             state=state,
                                             method='snapshot')

    self._record_action_start(context, instance,
                              instance_actions.CREATE_IMAGE)

    self.compute_rpcapi.snapshot_instance(context, instance,
                                          image_meta['id'])

    return image_meta
```
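Note that _record_action_start writes a CREATE_IMAGE row to the instance actions table before the RPC cast, so a snapshot that dies later on the compute node can still be traced. A sketch of that, reusing the hypothetical nova client from the earlier example:

```python
# reuses the `nova` client sketched earlier; server UUID is a placeholder
actions = nova.instance_action.list('INSTANCE_UUID')
for a in actions:
    print(a.action, a.start_time, a.message)

# drill into one action's events to see which step failed and where
detail = nova.instance_action.get('INSTANCE_UUID', actions[0].request_id)
for event in detail.events:
    print(event['event'], event['result'], event.get('traceback'))
```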
- nova/compute/manager.py
The compute manager updates the task state of the virtual machine and ultimately calls self.driver.snapshot to take the snapshot:
```python
def snapshot_instance(self, context, image_id, instance):
    """Snapshot an instance on this host.

    :param context: security context
    :param image_id: glance.db.sqlalchemy.models.Image.Id
    :param instance: a nova.objects.instance.Instance object
    """
    # NOTE(dave-mcnally) the task state will already be set by the api
    # but if the compute manager has crashed/been restarted prior to the
    # request getting here the task state may have been cleared so we set
    # it again and things continue normally
    try:
        instance.task_state = task_states.IMAGE_SNAPSHOT
        instance.save(
            expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
    except exception.InstanceNotFound:
        # possibility instance no longer exists, no point in continuing
        LOG.debug("Instance not found, could not set state %s "
                  "for instance.",
                  task_states.IMAGE_SNAPSHOT, instance=instance)
        return

    except exception.UnexpectedDeletingTaskStateError:
        LOG.debug("Instance being deleted, snapshot cannot continue",
                  instance=instance)
        return

    self._snapshot_instance(context, image_id, instance,
                            task_states.IMAGE_SNAPSHOT)
```
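The expected_task_state argument to instance.save() is effectively a compare-and-swap: the update only succeeds if the task state currently stored in the database matches the caller's expectation, which is how the IMAGE_SNAPSHOT_PENDING -> IMAGE_SNAPSHOT transition detects a concurrent delete. A toy model of the guard (a simplified sketch, not Nova's actual implementation):

```python
class ToyInstance:
    """Toy model of instance.save(expected_task_state=...) semantics."""

    def __init__(self):
        self.task_state = None      # in-memory value set by the caller
        self._db_task_state = None  # value currently persisted in the DB

    def save(self, expected_task_state=None):
        if expected_task_state is not None:
            expected = (expected_task_state
                        if isinstance(expected_task_state, (list, tuple))
                        else [expected_task_state])
            # refuse the write if someone else changed the row underneath us
            if self._db_task_state not in expected:
                raise RuntimeError('UnexpectedTaskStateError: %r not in %r'
                                   % (self._db_task_state, expected))
        self._db_task_state = self.task_state


inst = ToyInstance()
inst._db_task_state = 'image_snapshot_pending'           # set by the API step
inst.task_state = 'image_snapshot'
inst.save(expected_task_state='image_snapshot_pending')  # succeeds
```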
```python
def _snapshot_instance(self, context, image_id, instance,
                       expected_task_state):
    context = context.elevated()

    instance.power_state = self._get_power_state(context, instance)
    try:
        instance.save()

        LOG.info('instance snapshotting', instance=instance)

        if instance.power_state != power_state.RUNNING:
            state = instance.power_state
            running = power_state.RUNNING
            LOG.warning('trying to snapshot a non-running instance: '
                        '(state: %(state)s expected: %(running)s)',
                        {'state': state, 'running': running},
                        instance=instance)

        self._notify_about_instance_usage(
            context, instance, "snapshot.start")
        compute_utils.notify_about_instance_snapshot(context, instance,
            self.host, phase=fields.NotificationPhase.START,
            snapshot_image_id=image_id)

        def update_task_state(task_state,
                              expected_state=expected_task_state):
            instance.task_state = task_state
            instance.save(expected_task_state=expected_state)

        with timeutils.StopWatch() as timer:
            self.driver.snapshot(context, instance, image_id,
                                 update_task_state)
        LOG.info('Took %0.2f seconds to snapshot the instance on '
                 'the hypervisor.', timer.elapsed(), instance=instance)

        instance.task_state = None
        instance.save(expected_task_state=task_states.IMAGE_UPLOADING)

        self._notify_about_instance_usage(context, instance,
                                          "snapshot.end")
        compute_utils.notify_about_instance_snapshot(context, instance,
            self.host, phase=fields.NotificationPhase.END,
            snapshot_image_id=image_id)
    except (exception.InstanceNotFound,
            exception.UnexpectedDeletingTaskStateError):
        # the instance got deleted during the snapshot
        # Quickly bail out of here
        msg = 'Instance disappeared during snapshot'
        LOG.debug(msg, instance=instance)
        try:
            image = self.image_api.get(context, image_id)
            if image['status'] != 'active':
                self.image_api.delete(context, image_id)
        except exception.ImageNotFound:
            LOG.debug('Image not found during clean up %s', image_id)
        except Exception:
            LOG.warning("Error while trying to clean up image %s",
                        image_id, instance=instance)
    except exception.ImageNotFound:
        instance.task_state = None
        instance.save()
        LOG.warning("Image not found during snapshot", instance=instance)
```
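For orientation, here is the happy-path task_state sequence implied by the code above, written out as the string values of the task_states constants used in this flow:

```python
# happy-path task_state transitions for a snapshot:
SNAPSHOT_TASK_FLOW = [
    None,                      # steady state before the API call
    'image_snapshot_pending',  # set by the compute API
    'image_snapshot',          # set by the compute manager
    'image_pending_upload',    # set by the driver via update_task_state
    'image_uploading',         # set by the driver via update_task_state
    None,                      # cleared by the manager on success
]
```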
- nova/virt/libvirt/driver.py
This lands in the libvirt driver's snapshot function. This block of code is relatively complex; in the end root_disk.direct_snapshot is called to snapshot the Ceph RBD block device and obtain a backend url, and the image API is then called to update the image's location with that address:
```python
def snapshot(self, context, instance, image_id, update_task_state):
    """Create snapshot from a running VM instance.

    This command only works with qemu 0.14+
    """
    try:
        guest = self._host.get_guest(instance)

        # TODO(sahid): We are converting all calls from a
        # virDomain object to use nova.virt.libvirt.Guest.
        # We should be able to remove virt_dom at the end.
        virt_dom = guest._domain
    except exception.InstanceNotFound:
        raise exception.InstanceNotRunning(instance_id=instance.uuid)

    snapshot = self._image_api.get(context, image_id)

    # source_format is an on-disk format
    # source_type is a backend type
    disk_path, source_format = libvirt_utils.find_disk(guest)
    source_type = libvirt_utils.get_disk_type_from_path(disk_path)

    # We won't have source_type for raw or qcow2 disks, because we can't
    # determine that from the path. We should have it from the libvirt
    # xml, though.
    if source_type is None:
        source_type = source_format
    # For lxc instances we won't have it either from libvirt xml
    # (because we just gave libvirt the mounted filesystem), or the path,
    # so source_type is still going to be None. In this case,
    # root_disk is going to default to CONF.libvirt.images_type
    # below, which is still safe.

    image_format = CONF.libvirt.snapshot_image_format or source_type

    # NOTE(bfilippov): save lvm and rbd as raw
    if image_format == 'lvm' or image_format == 'rbd':
        image_format = 'raw'

    metadata = self._create_snapshot_metadata(instance.image_meta,
                                              instance,
                                              image_format,
                                              snapshot['name'])

    snapshot_name = uuidutils.generate_uuid(dashed=False)

    state = guest.get_power_state(self._host)

    # NOTE(dgenin): Instances with LVM encrypted ephemeral storage require
    #               cold snapshots. Currently, checking for encryption is
    #               redundant because LVM supports only cold snapshots.
    #               It is necessary in case this situation changes in the
    #               future.
    if (self._host.has_min_version(hv_type=host.HV_DRIVER_QEMU)
            and source_type not in ('lvm', 'qcow2')
            and not CONF.ephemeral_storage_encryption.enabled
            and not CONF.workarounds.disable_libvirt_livesnapshot
            # NOTE(rmk): We cannot perform live snapshots when a
            # managedSave file is present, so we will use the cold/legacy
            # method for instances which are shutdown or paused.
            # NOTE(mriedem): Live snapshot doesn't work with paused
            # instances on older versions of libvirt/qemu. We can likely
            # remove the restriction on PAUSED once we require
            # libvirt>=3.6.0 and qemu>=2.10 since that works with the
            # Pike Ubuntu Cloud Archive testing in Queens.
            and state not in (power_state.SHUTDOWN, power_state.PAUSED)):
        live_snapshot = True
        # Abort is an idempotent operation, so make sure any block
        # jobs which may have failed are ended. This operation also
        # confirms the running instance, as opposed to the system as a
        # whole, has a new enough version of the hypervisor (bug 1193146).
        try:
            guest.get_block_device(disk_path).abort_job()
        except libvirt.libvirtError as ex:
            error_code = ex.get_error_code()
            if error_code == libvirt.VIR_ERR_CONFIG_UNSUPPORTED:
                live_snapshot = False
            else:
                pass
    else:
        live_snapshot = False

    self._prepare_domain_for_snapshot(context, live_snapshot, state,
                                      instance)

    ceph_conf = dict()
    if source_type == "rbd":
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        for bdm in bdms:
            if bdm.get('boot_index', -1) == 0 and \
                    bdm.destination_type == 'volume':
                connection_info = jsonutils.loads(bdm.connection_info)
                ceph_conf["hosts"] = connection_info["data"]["hosts"]
                ceph_conf["auth_username"] = \
                    connection_info["data"]["auth_username"]
                LOG.info("Get ceph conf {ceph_conf}".format(
                    ceph_conf=ceph_conf))
                instance.ceph_conf = ceph_conf
                volume = self._volume_api.get(context, bdm['volume_id'])
                if volume.has_key('volume_image_metadata') and \
                        volume['volume_image_metadata'].get("stores", None):
                    metadata["backend"] = \
                        volume['volume_image_metadata']['stores']
                break
            if bdm.get('boot_index', -1) == 0 and \
                    bdm.destination_type == 'local':
                if CONF.libvirt.glance_backend_config:
                    metadata["backend"] = \
                        CONF.libvirt.glance_backend_config

    root_disk = self.image_backend.by_libvirt_path(
        instance, disk_path, image_type=source_type)

    LOG.info("Display image information, such as: instance is {instance}, "
             "disk_path is {disk_path}, source_format is {source_format}, "
             "source_type is {source_type}, image_format is "
             "{image_format}".format(
                 instance=instance, disk_path=disk_path,
                 source_format=source_format, source_type=source_type,
                 image_format=image_format))

    if live_snapshot:
        LOG.info("Beginning live snapshot process", instance=instance)
    else:
        LOG.info("Beginning cold snapshot process", instance=instance)

    update_task_state(task_state=task_states.IMAGE_PENDING_UPLOAD)

    update_task_state(task_state=task_states.IMAGE_UPLOADING,
                      expected_state=task_states.IMAGE_PENDING_UPLOAD)
    try:
        metadata['location'] = root_disk.direct_snapshot(
            context, snapshot_name, image_format, image_id,
            instance.image_ref)
        if CONF.enable_verify_image_md5sum:
            metadata['properties']['md5sum'] = \
                root_disk.calc_image_md5sum(metadata['location'])
        self._snapshot_domain(context, live_snapshot, virt_dom, state,
                              instance)
        self._image_api.update(context, image_id, metadata,
                               purge_props=False)
    except (NotImplementedError, exception.ImageUnacceptable,
            exception.Forbidden) as e:
        if type(e) != NotImplementedError:
            LOG.warning('Performing standard snapshot because direct '
                        'snapshot failed: %(error)s',
                        {'error': encodeutils.exception_to_unicode(e)})
        failed_snap = metadata.pop('location', None)
        if failed_snap:
            failed_snap = {'url': str(failed_snap)}
        root_disk.cleanup_direct_snapshot(failed_snap,
                                          also_destroy_volume=True,
                                          ignore_errors=True)
        update_task_state(task_state=task_states.IMAGE_PENDING_UPLOAD,
                          expected_state=task_states.IMAGE_UPLOADING)

        # TODO(nic): possibly abstract this out to the root_disk
        if source_type == 'rbd' and live_snapshot:
            # Standard snapshot uses qemu-img convert from RBD which is
            # not safe to run with live_snapshot.
            live_snapshot = False
            # Suspend the guest, so this is no longer a live snapshot
            self._prepare_domain_for_snapshot(context, live_snapshot,
                                              state, instance)

        snapshot_directory = CONF.libvirt.snapshots_directory
        fileutils.ensure_tree(snapshot_directory)
        with utils.tempdir(dir=snapshot_directory) as tmpdir:
            try:
                out_path = os.path.join(tmpdir, snapshot_name)
                if live_snapshot:
                    # NOTE(xqueralt): libvirt needs o+x in the tempdir
                    os.chmod(tmpdir, 0o701)
                    self._live_snapshot(context, instance, guest,
                                        disk_path, out_path,
                                        source_format, image_format,
                                        instance.image_meta)
                    if source_format == 'qcow2' and \
                            CONF.libvirt.convert_ceph_raw_for_qcow2:
                        snapshot_path = out_path + '.delta'
                        # use qemu-img convert to convert qcow2 img to
                        # ceph raw
                        metadata['location'] = root_disk.convert_to_ceph(
                            context, snapshot_path, image_format,
                            image_id)
                        if CONF.enable_verify_image_md5sum:
                            metadata['properties']['md5sum'] = \
                                root_disk.calc_image_md5sum(
                                    metadata['location'])
                else:
                    if source_format == 'qcow2' and \
                            CONF.libvirt.convert_ceph_raw_for_qcow2:
                        metadata['location'] = root_disk.convert_to_ceph(
                            context, None, image_format, image_id)
                        if CONF.enable_verify_image_md5sum:
                            metadata['properties']['md5sum'] = \
                                root_disk.calc_image_md5sum(
                                    metadata['location'])
                    else:
                        root_disk.snapshot_extract(out_path, image_format)
                if not CONF.libvirt.convert_ceph_raw_for_qcow2:
                    LOG.info("Snapshot extracted, beginning image upload",
                             instance=instance)
            except libvirt.libvirtError as ex:
                error_code = ex.get_error_code()
                if error_code == libvirt.VIR_ERR_NO_DOMAIN:
                    LOG.info('Instance %(instance_name)s disappeared '
                             'while taking snapshot of it: [Error Code '
                             '%(error_code)s] %(ex)s',
                             {'instance_name': instance.name,
                              'error_code': error_code,
                              'ex': ex},
                             instance=instance)
                    raise exception.InstanceNotFound(
                        instance_id=instance.uuid)
                else:
                    raise
            finally:
                self._snapshot_domain(context, live_snapshot, virt_dom,
                                      state, instance)

            update_task_state(
                task_state=task_states.IMAGE_UPLOADING,
                expected_state=task_states.IMAGE_PENDING_UPLOAD)

            if source_format == 'qcow2' and \
                    CONF.libvirt.convert_ceph_raw_for_qcow2:
                self._image_api.update(context, image_id, metadata,
                                       purge_props=False)
            else:
                # Upload that image to the image service
                LOG.debug('upload to glance, out_path:%s', out_path)
                with libvirt_utils.file_open(out_path, 'rb') as image_file:
                    # execute operation with disk concurrency semaphore
                    with compute_utils.disk_ops_semaphore:
                        self._image_api.update(context,
                                               image_id,
                                               metadata,
                                               image_file)
    except Exception:
        with excutils.save_and_reraise_exception():
            LOG.exception(_("Failed to snapshot image"))
            failed_snap = metadata.pop('location', None)
            if failed_snap:
                failed_snap = {'url': str(failed_snap)}
            root_disk.cleanup_direct_snapshot(
                failed_snap, also_destroy_volume=True,
                ignore_errors=True)

    LOG.info("Snapshot image upload complete", instance=instance)
```
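When direct_snapshot raises NotImplementedError (non-RBD backends) or otherwise fails, the fallback path above extracts the disk to a local file and uploads it through Glance. For a qcow2 backend, snapshot_extract is roughly a qemu-img convert; a hedged sketch of what runs under the hood (paths and the output format are hypothetical placeholders, and the real code goes through Nova's images.convert_image() helper):

```python
import subprocess

# rough equivalent of snapshot_extract for a qcow2-backed root disk
subprocess.check_call([
    'qemu-img', 'convert',
    '-f', 'qcow2',   # on-disk source format
    '-O', 'raw',     # target format (the image_format computed above)
    '/var/lib/nova/instances/INSTANCE_UUID/disk',
    '/var/lib/nova/instances/snapshots/tmpXXXX/SNAPSHOT_NAME',
])
```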
```python
def direct_snapshot(self, context, snapshot_name, image_format,
                    image_id, base_image_id):
    """Creates an RBD snapshot directly.
    """
    fsid = self.driver.get_fsid()
    # NOTE(nic): Nova has zero comprehension of how Glance's image store
    # is configured, but we can infer what storage pool Glance is using
    # by looking at the parent image. If using authx, write access should
    # be enabled on that pool for the Nova user
    # parent_pool = self._get_parent_pool(context, base_image_id, fsid)
    # use instance pool to save image
    parent_pool = self.pool
    LOG.debug('self.path:%s, self.pool:%s', self.path, self.pool)

    # Snapshot the disk and clone it into Glance's storage pool. librbd
    # requires that snapshots be set to "protected" in order to clone them
    self.driver.create_snap(self.rbd_name, snapshot_name, protect=True)
    location = {'url': 'rbd://%(fsid)s/%(pool)s/%(image)s/%(snap)s' %
                       dict(fsid=fsid,
                            pool=self.pool,
                            image=self.rbd_name,
                            snap=snapshot_name)}
    try:
        self.driver.clone(location, image_id, dest_pool=parent_pool)
        # Flatten the image, which detaches it from the source snapshot
        self.driver.flatten(image_id, pool=parent_pool)
    finally:
        # all done with the source snapshot, clean it up
        self.cleanup_direct_snapshot(location)

    # Glance makes a protected snapshot called 'snap' on uploaded
    # images and hands it out, so we'll do that too. The name of
    # the snapshot doesn't really matter, this just uses what the
    # glance-store rbd backend sets (which is not configurable).
    self.driver.create_snap(image_id, 'snap', pool=parent_pool,
                            protect=True)
    return ('rbd://%(fsid)s/%(pool)s/%(image)s/snap' %
            dict(fsid=fsid, pool=parent_pool, image=image_id))
```
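On the Ceph side, what direct_snapshot does boils down to snapshot, clone, flatten, then clean up the source snapshot. A rough equivalent with the librbd Python bindings; the pool and image names here are hypothetical (in the code above the source is the instance's root disk and the destination name is the Glance image UUID):

```python
import rados
import rbd

# hypothetical names for illustration
SRC_POOL, SRC_IMAGE, SNAP = 'vms', 'instance_disk', 'snapshot_name'
DST_POOL, DST_IMAGE = 'vms', 'glance_image_uuid'

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
try:
    src = cluster.open_ioctx(SRC_POOL)
    dst = cluster.open_ioctx(DST_POOL)
    with rbd.Image(src, SRC_IMAGE) as disk:
        disk.create_snap(SNAP)
        disk.protect_snap(SNAP)    # librbd requires this before cloning
    rbd.RBD().clone(src, SRC_IMAGE, SNAP, dst, DST_IMAGE)
    with rbd.Image(dst, DST_IMAGE) as clone:
        clone.flatten()            # detach the clone from the source snap
        clone.create_snap('snap')  # mimic glance-store's protected 'snap'
        clone.protect_snap('snap')
    with rbd.Image(src, SRC_IMAGE) as disk:
        disk.unprotect_snap(SNAP)  # source snapshot is no longer needed
        disk.remove_snap(SNAP)
    src.close()
    dst.close()
finally:
    cluster.shutdown()
```

The URL handed back to Glance, rbd://{fsid}/{pool}/{image_id}/snap, points at that final protected 'snap' snapshot, which Glance's rbd store can serve directly without any data ever passing through the compute node.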