cinder服务状态监控

Posted by xue on June 18, 2017

以kilo版本cinder为例

cinder ServiceController

class ServiceController(wsgi.Controller):
    @wsgi.serializers(xml=ServicesIndexTemplate)
    def index(self, req):
        """Return a list of all running services.
        Filter by host & service name.
        """
        context = req.environ['cinder.context']
        authorize(context)
        detailed = self.ext_mgr.is_loaded('os-extended-services')
        now = timeutils.utcnow() # 获取当前时间
        services = db.service_get_all(context) # 获取service 列表
        ...
        svcs = []
        for svc in services:
            updated_at = svc['updated_at']
            delta = now - (svc['updated_at'] or svc['created_at']) # 获取 updated_at,不存在的话,获取 created_at。并和当前时间计算时间差
            delta_sec = delta.total_seconds() # 转换成秒
            ...
            alive = abs(delta_sec) <= CONF.service_down_time # 检查是否小于配置的 server_down_time,该配置项默认是60秒
            art = (alive and "up") or "down" # 如果差值小于60,则service 状态为 up,否则为 down
            active = 'enabled'
            if svc['disabled']: # 如何从数据库查询到的service disabled != 0,则表示service 状态是disabled
                active = 'disabled'
            ret_fields = {'binary': svc['binary'], 'host': svc['host'],
                          'zone': svc['availability_zone'],
                          'status': active, 'state': art,
                          'updated_at': updated_at}
            if detailed:
                ret_fields['disabled_reason'] = svc['disabled_reason']
            svcs.append(ret_fields)
        return {'services': svcs}
    

可见,service返回结果状态是否为up,取决于now - updated_at 与CONF.service_down_time 谁大谁小。

cinder updated_at更新机制

接下来看看cinder db 中service表updated_at字段的更新机制

cinder-api,cinder-backup 等服务都是class Service(service.Service) 的一个实例,它的start方法如下:

# cinder/service.py

class Service(service.Service):
    def start(self):
        version_string = version.version_string()
        LOG.info(_LI('Starting %(topic)s node (version %(version_string)s)'),
                 {'topic': self.topic, 'version_string': version_string})
        self.model_disconnected = False
        self.manager.init_host()
        ctxt = context.get_admin_context()
        try:
            service_ref = db.service_get_by_args(ctxt,
                                                 self.host,
                                                 self.binary)
            self.service_id = service_ref['id']
        except exception.NotFound:
            self._create_service_ref(ctxt)

        LOG.debug("Creating RPC server for service %s", self.topic)

        target = messaging.Target(topic=self.topic, server=self.host)
        endpoints = [self.manager]
        endpoints.extend(self.manager.additional_endpoints)
        serializer = objects_base.CinderObjectSerializer()
        self.rpcserver = rpc.get_server(target, endpoints, serializer)
        self.rpcserver.start()

        self.manager.init_host_with_rpc()

        if self.report_interval:
            # 启动一个循环来执行 report_state 方法,运行间隔就是 report_interval,其默认值是 10 秒,report_state会更新数据库
            pulse = loopingcall.FixedIntervalLoopingCall(
                self.report_state)
            pulse.start(interval=self.report_interval,
                        initial_delay=self.report_interval)
            self.timers.append(pulse)

        if self.periodic_interval:
            if self.periodic_fuzzy_delay:
                initial_delay = random.randint(0, self.periodic_fuzzy_delay)
            else:
                initial_delay = None

            periodic = loopingcall.FixedIntervalLoopingCall(
                self.periodic_tasks)
            periodic.start(interval=self.periodic_interval,
                           initial_delay=initial_delay)
            self.timers.append(periodic)
            
    def report_state(self):
        """Update the state of this service in the datastore."""
        ctxt = context.get_admin_context()
        zone = CONF.storage_availability_zone
        state_catalog = {}
        try:
            try:
                service_ref = db.service_get(ctxt, self.service_id)
            except exception.NotFound:
                LOG.debug('The service database object disappeared, '
                          'recreating it.')
                self._create_service_ref(ctxt)
                service_ref = db.service_get(ctxt, self.service_id)

            state_catalog['report_count'] = service_ref['report_count'] + 1 # report_count的值会加1
            if zone != service_ref['availability_zone']:
                state_catalog['availability_zone'] = zone

            db.service_update(ctxt,
                              self.service_id, state_catalog) # //更新该 service,在更新的时候updated_at会自动根据当前时间更新

            # TODO(termie): make this pattern be more elegant.
            if getattr(self, 'model_disconnected', False):
                self.model_disconnected = False
                LOG.error(_LE('Recovered model server connection!'))

        except db_exc.DBConnectionError:
            if not getattr(self, 'model_disconnected', False):
                self.model_disconnected = True
                LOG.exception(_LE('model server went away'))

        # NOTE(jsbryant) Other DB errors can happen in HA configurations.
        # such errors shouldn't kill this thread, so we handle them here.
        except db_exc.DBError:
            if not getattr(self, 'model_disconnected', False):
                self.model_disconnected = True
                LOG.exception(_LE('DBError encountered: '))

            

再来看看循环的类FixedIntervalLoopingCall,上面的代码可以看到,FixedIntervalLoopingCall初始化的时候传入了self.report_state, 然后执行了start方法

# cinder/openstack/common/loopingcall.py
_ts = lambda: time.time()
class LoopingCallBase(object):
    def __init__(self, f=None, *args, **kw):
        self.args = args
        self.kw = kw
        self.f = f
        self._running = False
        self.done = None

    def stop(self):
        self._running = False

    def wait(self):
        return self.done.wait()


class FixedIntervalLoopingCall(LoopingCallBase):
    """A fixed interval looping call."""

    def start(self, interval, initial_delay=None):
        self._running = True
        done = event.Event()

        def _inner():
            if initial_delay:
                greenthread.sleep(initial_delay)

            try:
                while self._running:
                    start = _ts() # 记录更新数据库开始的时间
                    self.f(*self.args, **self.kw) # 根据传进来的相应函数,执行相应的方法
                    end = _ts() # 记录更新数据库结束的时间
                    if not self._running:
                        break
                    delay = end - start - interval
                    if delay > 0:
                        LOG.warn(_LW('task %(func_name)r run outlasted '
                                     'interval by %(delay).2f sec'),
                                 {'func_name': self.f, 'delay': delay})
                    greenthread.sleep(-delay if delay < 0 else 0)
            except LoopingCallDone as e:
                self.stop()
                done.send(e.retvalue)
            except Exception:
                LOG.exception(_LE('in fixed duration looping call'))
                done.send_exception(*sys.exc_info())
                return
            else:
                done.send(True)

        self.done = done

        greenthread.spawn_n(_inner) # 调用函数_inner的绿色线程
        return self.done

问题定位

如果发现某个服务的状态为down,在启动日志没有出错的情况下,可以按照下面的步骤进行定位:

(1)看看是不是在 cinder.conf 中 report_interval 配置项的值是多少,如果超过了 service_down_time 配置项默认的 60 秒,那么该service 的状态肯定就是 ‘down’ 了。

(2)看 service 所在节点的时间,它的时间和 controller 节点的时间误差必须在 [service_down_time - report_interval ] 之内,也就是在使用默认配置情况下,时间差必须在 50 秒之内。

(3)看看 service 的 log 文件中,确认 report_state 方法是不是都按时被调用了,不方便看的话,在代码中加个注释吧。

参考

世民谈云计算