# Your IP : 216.73.216.220
#
# Current Path : /usr/lib/python3.6/site-packages/azurelinuxagent/common/
# Current File : //usr/lib/python3.6/site-packages/azurelinuxagent/common/cgroupstelemetry.py
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
import errno
import threading
from collections import namedtuple
from datetime import datetime as dt

from azurelinuxagent.common import logger
from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.logger import EVERY_SIX_HOURS
from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo

# A single reported sample: (category, counter, instance, value).
# NOTE: the generated tuple type is named 'Metric' even though it is bound to MetricValue here; it is a different
# type from the Metric class defined further down in this module.
MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
# Pairs a process summary string ("pid | name | cmdline") with the metric object holding that process' samples.
StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])

# Separator placed between the pid, the process name and the command line in a process summary string.
DELIM = " | "
# Placeholders used in a process summary when the process name / command line cannot be retrieved.
DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"


class MetricsCategory(object):
    """
    Names of the categories under which collected metrics are reported.
    """
    # Category used for CPU usage samples
    PROCESS_CATEGORY = "Process"
    # Category used for every memory related sample
    MEMORY_CATEGORY = "Memory"


class MetricsCounter(object):
    """
    Names of the individual counters reported within a metrics category.
    """
    # CPU counter
    PROCESSOR_PERCENT_TIME = "% Processor Time"

    # Memory counters
    MEM_USED_BY_PROCESS = "Memory Used by Process"
    MAX_MEM_USAGE = "Max Memory Usage"
    TOTAL_MEM_USAGE = "Total Memory Usage"


class CGroupsTelemetry(object):
    """
    Registry of the cgroups being tracked and of the usage samples collected from them.

    All the state is kept in class attributes and every method is static, so the class acts as a single shared
    namespace. Access to _tracked and _cgroup_metrics is serialized with _rlock.
    """
    # cgroup objects visited by poll_all_tracked()
    _tracked = []
    # cgroup name -> CgroupMetrics accumulated since the last call to report_all_tracked()
    _cgroup_metrics = {}
    # Re-entrant, because methods holding the lock call other methods that take it (e.g. track_cgroup -> is_tracked)
    _rlock = threading.RLock()

    @staticmethod
    def get_process_info_summary(process_id):
        """
        Builds the "<pid> | <process name> | <command line>" string identifying a process.

        :param process_id: pid of the process; it is concatenated as-is, so it must be a string.
        :return: the summary string; the name and/or command line fall back to DEFAULT_PROCESS_NAME and
                 DEFAULT_PROCESS_COMMANDLINE when they cannot be retrieved.
        """
        process_cmdline = DEFAULT_PROCESS_COMMANDLINE
        process_name = DEFAULT_PROCESS_NAME

        # The ProcessName and ProcessCommandLine can generate Exception if the file /proc/<pid>/{comm,cmdline} cease to
        # exist; eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the
        # details from those files.
        try:
            cmdline = ProcessInfo.get_proc_cmdline(process_id)
            # Keep the default when nothing was returned; a None here would break the concatenation below
            if cmdline is not None:
                process_cmdline = cmdline
        except Exception as e:
            logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))

        try:
            name = ProcessInfo.get_proc_name(process_id)
            if name is not None:
                process_name = name
        except Exception as e:
            logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))

        return process_id + DELIM + process_name + DELIM + process_cmdline

    @staticmethod
    def _get_metrics_list(metric):
        """
        Flattens a metric into [average, min, max, median, count, first_poll_time, last_poll_time].
        """
        return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),
                metric.first_poll_time(), metric.last_poll_time()]

    @staticmethod
    def _should_log_error(error):
        """
        Returns False only for IOError/OSError with errno ENOENT (the file disappeared while it was being read);
        every other error is worth logging.
        """
        return not isinstance(error, (IOError, OSError)) or error.errno != errno.ENOENT

    @staticmethod
    def _process_cgroup_metric(cgroup_metrics):
        """
        Converts the samples accumulated for one cgroup into a dictionary of summaries.

        :param cgroup_metrics: object exposing get_memory_metrics, get_max_memory_metrics, get_cpu_metrics and
                               get_proc_statm_memory_metrics.
        :return: dictionary that can contain the keys "cpu" ({"cur_cpu": [...]}), "memory" ({"cur_mem": [...],
                 "max_mem": [...]}) and "proc_statm_memory" ({<process summary>: [...]}); a key is present only
                 when there are samples for it. Each list is produced by _get_metrics_list.
        """
        memory_usage = cgroup_metrics.get_memory_metrics()
        max_memory_usage = cgroup_metrics.get_max_memory_metrics()
        cpu_usage = cgroup_metrics.get_cpu_metrics()
        memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()

        processed_extension = {}

        if cpu_usage.count() > 0:
            processed_extension["cpu"] = {"cur_cpu": CGroupsTelemetry._get_metrics_list(cpu_usage)}

        if memory_usage.count() > 0:
            processed_extension.setdefault("memory", {})["cur_mem"] = \
                CGroupsTelemetry._get_metrics_list(memory_usage)

        if max_memory_usage.count() > 0:
            processed_extension.setdefault("memory", {})["max_mem"] = \
                CGroupsTelemetry._get_metrics_list(max_memory_usage)

        for pid_process_memory in memory_usage_per_process:
            processed_extension.setdefault("proc_statm_memory", {})[pid_process_memory.pid_name_cmdline] = \
                CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)

        return processed_extension

    @staticmethod
    def track_cgroup(cgroup):
        """
        Adds the given cgroup to the list of tracked cgroups, unless a cgroup with the same path is already tracked
        """
        if isinstance(cgroup, CpuCgroup):
            # set the current cpu usage
            cgroup.initialize_cpu_usage()

        with CGroupsTelemetry._rlock:
            if not CGroupsTelemetry.is_tracked(cgroup.path):
                CGroupsTelemetry._tracked.append(cgroup)
                logger.info("Started tracking new cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path))

    @staticmethod
    def is_tracked(path):
        """
        Returns true if a cgroup with the given path is in the list of tracked items
        O(n) operation. But limited to few cgroup objects we have.
        """
        with CGroupsTelemetry._rlock:
            return any(path == cgroup.path for cgroup in CGroupsTelemetry._tracked)

    @staticmethod
    def stop_tracking(cgroup):
        """
        Stop tracking the given cgroup. Raises ValueError if the cgroup is not in the list of tracked items.
        """
        with CGroupsTelemetry._rlock:
            CGroupsTelemetry._tracked.remove(cgroup)
            logger.info("Stopped tracking cgroup: {0}, path: {1}".format(cgroup.name, cgroup.path))

    @staticmethod
    def report_all_tracked():
        """
        The report_all_tracked's purpose is to collect the data from the tracked cgroups and process the metric into a
        data structure by _process_cgroup_metric. The perf metric is added into the data structure and returned to the
        caller.

        The accumulated samples are cleared once they have been processed, and the entries that were marked for
        delete are removed.

        The report_all_tracked would be removed soon - in favor of sending report_metric directly, when polling the data
        from tracked groups.

        :return collected_metrics: dictionary of cgroups metrics, keyed by cgroup name; cgroups without samples are
                                   omitted.
        """
        collected_metrics = {}

        # poll_all_tracked() and reset() modify _cgroup_metrics while holding the lock; hold it here too so that the
        # dictionary cannot change while it is being iterated.
        with CGroupsTelemetry._rlock:
            for name, cgroup_metrics in CGroupsTelemetry._cgroup_metrics.items():
                perf_metric = CGroupsTelemetry._process_cgroup_metric(cgroup_metrics)

                if perf_metric:
                    collected_metrics[name] = perf_metric

                cgroup_metrics.clear()

            # Doing cleanup after the metrics have already been collected.
            for key in [key for key in CGroupsTelemetry._cgroup_metrics if
                        CGroupsTelemetry._cgroup_metrics[key].marked_for_delete]:
                del CGroupsTelemetry._cgroup_metrics[key]

        return collected_metrics

    @staticmethod
    def poll_all_tracked():
        """
        Takes one sample from every tracked cgroup, stores it in _cgroup_metrics under the cgroup's name and
        returns the samples.

        Cgroups that report themselves as not active are removed from the tracked list and their accumulated
        metrics are marked for delete. Errors raised while sampling are logged periodically, except for "file not
        found" errors, which are ignored.

        :return: list of MetricValue, one per sample taken during this call.
        """
        metrics = []

        with CGroupsTelemetry._rlock:
            # Iterate over a copy, since stop_tracking() removes items from _tracked
            for cgroup in CGroupsTelemetry._tracked[:]:
                if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
                    CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()
                cgroup_metrics = CGroupsTelemetry._cgroup_metrics[cgroup.name]

                try:
                    if cgroup.controller == CGroupContollers.CPU:
                        current_cpu_usage = cgroup.get_cpu_usage()
                        cgroup_metrics.add_cpu_usage(current_cpu_usage)
                        metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY,
                                                   MetricsCounter.PROCESSOR_PERCENT_TIME,
                                                   cgroup.name, current_cpu_usage))
                    elif cgroup.controller == CGroupContollers.MEMORY:
                        current_memory_usage = cgroup.get_memory_usage()
                        cgroup_metrics.add_memory_usage(current_memory_usage)
                        metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE,
                                                   cgroup.name, current_memory_usage))

                        max_memory_usage = cgroup.get_max_memory_usage()
                        cgroup_metrics.add_max_memory_usage(max_memory_usage)
                        metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE,
                                                   cgroup.name, max_memory_usage))

                        pids = cgroup.get_tracked_processes()
                        for pid in pids:
                            try:
                                mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid)
                                # Build the summary only once: the reported metric and the stored sample then
                                # always use the same key, and the process details are fetched a single time.
                                process_summary = CGroupsTelemetry.get_process_info_summary(pid)
                                metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY,
                                                           MetricsCounter.MEM_USED_BY_PROCESS, process_summary,
                                                           mem_usage_from_procstatm))
                                cgroup_metrics.add_proc_statm_memory(process_summary, mem_usage_from_procstatm)
                            except Exception as e:
                                if CGroupsTelemetry._should_log_error(e):
                                    logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm "
                                                                            "for pid {0}. Error : {1}", pid, ustr(e))
                    else:
                        raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(
                            cgroup.controller, cgroup.name))
                except Exception as e:
                    # There can be scenarios when the CGroup has been deleted by the time we are fetching the values
                    # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
                    # every occurrences of such case as it would be very verbose. We do want to log all the other
                    # exceptions which could occur, which is why we do a periodic log for all the other errors.
                    if CGroupsTelemetry._should_log_error(e):
                        logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup '
                                                                '{0}. Error : {1}'.format(cgroup.name, ustr(e)))
                if not cgroup.is_active():
                    CGroupsTelemetry.stop_tracking(cgroup)
                    cgroup_metrics.marked_for_delete = True

        return metrics

    @staticmethod
    def prune_all_tracked():
        """
        Stops tracking every cgroup that reports itself as not active.
        """
        with CGroupsTelemetry._rlock:
            # Iterate over a copy, since stop_tracking() removes items from _tracked
            for cgroup in CGroupsTelemetry._tracked[:]:
                if not cgroup.is_active():
                    CGroupsTelemetry.stop_tracking(cgroup)

    @staticmethod
    def reset():
        """
        Forgets every tracked cgroup and every accumulated metric.
        """
        with CGroupsTelemetry._rlock:
            CGroupsTelemetry._tracked *= 0  # emptying the list
            CGroupsTelemetry._cgroup_metrics = {}


class CgroupMetrics(object):
    """
    Holds the samples collected for one cgroup: cpu usage, memory usage, max memory usage and the memory usage of
    each process. Once marked_for_delete is set, the add_* methods ignore new samples.
    """

    def __init__(self):
        self.marked_for_delete = False

        self._cpu_usage = Metric()
        self._memory_usage = Metric()
        self._max_memory_usage = Metric()
        # process summary -> Metric
        self._proc_statm_mem = {}

    def add_memory_usage(self, usage):
        """Records a memory usage sample, unless marked for delete."""
        if self.marked_for_delete:
            return
        self._memory_usage.append(usage)

    def add_max_memory_usage(self, usage):
        """Records a max memory usage sample, unless marked for delete."""
        if self.marked_for_delete:
            return
        self._max_memory_usage.append(usage)

    def add_cpu_usage(self, usage):
        """Records a cpu usage sample, unless marked for delete."""
        if self.marked_for_delete:
            return
        self._cpu_usage.append(usage)

    def add_proc_statm_memory(self, pid, usage):
        """Records a memory usage sample for the given process, unless marked for delete."""
        if self.marked_for_delete:
            return
        try:
            per_process = self._proc_statm_mem[pid]
        except KeyError:
            # First sample for this process
            per_process = self._proc_statm_mem[pid] = Metric()
        per_process.append(usage)

    def get_memory_metrics(self):
        return self._memory_usage

    def get_max_memory_metrics(self):
        return self._max_memory_usage

    def get_cpu_metrics(self):
        return self._cpu_usage

    def get_proc_statm_memory_metrics(self):
        """
        :return: list of StatmMetricValue tuples, one per process, pairing its key with its metric
        """
        per_process_metrics = []
        for pid_name_cmdline, metric in self._proc_statm_mem.items():
            per_process_metrics.append(StatmMetricValue(pid_name_cmdline, metric))
        return per_process_metrics

    def clear(self):
        """Discards every collected sample; marked_for_delete is left untouched."""
        for collected in (self._memory_usage, self._max_memory_usage, self._cpu_usage):
            collected.clear()
        self._proc_statm_mem.clear()


class Metric(object):
    """
    Collection of samples, together with the UTC time at which the first and the most recent sample were added.
    """

    def __init__(self):
        self._first_poll_time = None
        self._last_poll_time = None
        self._data = []

    def append(self, data):
        """Adds a sample and updates the poll times."""
        # The first poll time is set only by the first sample added after creation or clear()
        self._first_poll_time = self._first_poll_time or dt.utcnow()

        self._data.append(data)
        self._last_poll_time = dt.utcnow()

    def clear(self):
        """Drops every sample (the list is emptied in place) and resets the poll times."""
        del self._data[:]
        self._first_poll_time = None
        self._last_poll_time = None

    def average(self):
        """Mean of the samples as a float, or None when there are no samples."""
        if not self._data:
            return None
        return float(sum(self._data)) / float(len(self._data))

    def max(self):
        """Largest sample, or None when there are no samples."""
        if not self._data:
            return None
        return max(self._data)

    def min(self):
        """Smallest sample, or None when there are no samples."""
        if not self._data:
            return None
        return min(self._data)

    def median(self):
        """
        Middle sample for an odd number of samples, mean of the two middle samples (as a float) for an even
        number, None when there are no samples.
        """
        ordered = sorted(self._data)
        if not ordered:
            return None

        middle = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[middle]
        return (ordered[middle - 1] + ordered[middle]) / 2.0

    def count(self):
        """Number of samples."""
        return len(self._data)

    def first_poll_time(self):
        """String form of the first poll time ("None" when there are no samples)."""
        return str(self._first_poll_time)

    def last_poll_time(self):
        """String form of the last poll time ("None" when there are no samples)."""
        return str(self._last_poll_time)