说起Kubernetes监控,绕不开的话题就是Prometheus。随着Prometheus的发展,已然成为Kubernetes监控体系中的标准。

本文主要介绍通过yaml部署清单来安装Prometheus。

部署Prometheus

包含组件:

  • prometheus
  • kube-state-metrics
  • node-exporter
  • pushgateway
  • alertmanager
  • grafana

prometheus组件.png

ns.yaml

apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

svc.yaml

apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app.kubernetes.io/name: kube-state-metrics
  annotations:
    prometheus.io/scrape: 'true'
spec:
  type: "ClusterIP"
  ports:
  - name: "http"
    protocol: TCP
    port: 8080
    targetPort: 8080
  selector:
    app.kubernetes.io/name: kube-state-metrics

---
apiVersion: v1
kind: Service
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
  namespace: monitoring
spec:
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 9093
  selector:
    component: "alertmanager"
    app: prometheus
  sessionAffinity: None
  type: "ClusterIP"

---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"

  labels:
    component: "node-exporter"
    app: prometheus
  name: prometheus-node-exporter
  namespace: monitoring
spec:
  clusterIP: None
  ports:
    - name: metrics
      port: 9100
      protocol: TCP
      targetPort: 9100
  selector:
    component: "node-exporter"
    app: prometheus
  type: "ClusterIP"
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/probe: pushgateway
    
  labels:
    component: "pushgateway"
    app: prometheus
  name: prometheus-pushgateway
  namespace: monitoring
spec:
  ports:
    - name: http
      port: 9091
      protocol: TCP
      targetPort: 9091
  selector:
    component: "pushgateway"
    app: prometheus
  type: "ClusterIP"

---
apiVersion: v1
kind: Service
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
  namespace: monitoring
spec:
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 9090
  selector:
    component: "server"
    app: prometheus
  sessionAffinity: None
  type: "NodePort"

---
apiVersion: v1
kind: Service
metadata:
  labels:
    component: "grafana"
    app: prometheus
  name: prometheus-grafana
  namespace: monitoring
spec:
  ports:
    - name: http
      port: 3000
      protocol: TCP
  selector:
    component: "grafana"
    app: prometheus
  sessionAffinity: None
  type: "NodePort"

rbac.yaml

apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
  namespace: monitoring
imagePullSecrets:
  []
  
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
  namespace: monitoring
    
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    component: "node-exporter"
    app: prometheus
  name: prometheus-node-exporter
  namespace: monitoring
    
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    component: "pushgateway"
    app: prometheus
  name: prometheus-pushgateway
  namespace: monitoring
    
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
  namespace: monitoring

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
rules:
- apiGroups: ["certificates.k8s.io"]
  resources:
  - certificatesigningrequests
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - cronjobs
  verbs: ["list", "watch"]
- apiGroups: ["extensions", "apps"]
  resources:
  - daemonsets
  verbs: ["list", "watch"]
- apiGroups: ["extensions", "apps"]
  resources:
  - deployments
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - endpoints
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources:
  - horizontalpodautoscalers
  verbs: ["list", "watch"]
- apiGroups: ["extensions", "networking.k8s.io"]
  resources:
  - ingresses
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - jobs
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - limitranges
  verbs: ["list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
  resources:
    - mutatingwebhookconfigurations
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - namespaces
  verbs: ["list", "watch"]
- apiGroups: ["networking.k8s.io"]
  resources:
  - networkpolicies
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - nodes
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - persistentvolumeclaims
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - persistentvolumes
  verbs: ["list", "watch"]
- apiGroups: ["policy"]
  resources:
    - poddisruptionbudgets
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - pods
  verbs: ["list", "watch"]
- apiGroups: ["extensions", "apps"]
  resources:
  - replicasets
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - replicationcontrollers
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - resourcequotas
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - secrets
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - services
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources:
  - statefulsets
  verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
  resources:
    - storageclasses
  verbs: ["list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
  resources:
    - validatingwebhookconfigurations
  verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
  resources:
    - volumeattachments
  verbs: ["list", "watch"]

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
rules:
  []

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  labels:
    component: "pushgateway"
    app: prometheus
  name: prometheus-pushgateway
rules:
  []

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
      - ingresses
      - configmaps
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - "extensions"
      - "networking.k8s.io"
    resources:
      - ingresses/status
      - ingresses
    verbs:
      - get
      - list
      - watch
  - nonResourceURLs:
      - "/metrics"
    verbs:
      - get

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: monitoring

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
subjects:
  - kind: ServiceAccount
    name: prometheus-alertmanager
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-alertmanager

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  labels:
    component: "pushgateway"
    app: prometheus
  name: prometheus-pushgateway
subjects:
  - kind: ServiceAccount
    name: prometheus-pushgateway
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-pushgateway

---
apiVersion: rbac.authorization.k8s.io/v1
#如果是低版本,apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
subjects:
  - kind: ServiceAccount
    name: prometheus-server
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-server

pv.yaml

apiVersion: v1
kind: PersistentVolume
metadata:
  name: monitoring-pv1
  namespace: monitoring
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  nfs:
    path: /volume1/k8s/monitoring/pv1
    server: 192.168.72.2
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: monitoring-pv2
  namespace: monitoring
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  nfs:
    path: /volume1/k8s/monitoring/pv2
    server: 192.168.72.2
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: monitoring-pv3
  namespace: monitoring
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  nfs:
    path: /volume1/k8s/monitoring/pv3
    server: 192.168.72.2

pvc.yaml

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "2Gi"
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "8Gi"
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-grafana
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "8Gi"

server-cm.yaml

apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
  namespace: monitoring
data:
  alerting_rules.yml: |
    {}
    
  alerts: |
    {}
    
  prometheus.yml: |
    global:
      evaluation_interval: 1m
      scrape_interval: 1m
      scrape_timeout: 10s
    rule_files:
    - /etc/config/recording_rules.yml
    - /etc/config/alerting_rules.yml
    - /etc/config/rules
    - /etc/config/alerts
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets:
        - localhost:9090
    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      job_name: kubernetes-apiservers
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - action: keep
        regex: default;kubernetes;https
        source_labels:
        - __meta_kubernetes_namespace
        - __meta_kubernetes_service_name
        - __meta_kubernetes_endpoint_port_name
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      job_name: kubernetes-nodes
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - replacement: kubernetes.default.svc:443
        target_label: __address__
      - regex: (.+)
        replacement: /api/v1/nodes/$1/proxy/metrics
        source_labels:
        - __meta_kubernetes_node_name
        target_label: __metrics_path__
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      job_name: kubernetes-nodes-cadvisor
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - replacement: kubernetes.default.svc:443
        target_label: __address__
      - regex: (.+)
        replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
        source_labels:
        - __meta_kubernetes_node_name
        target_label: __metrics_path__
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
    - job_name: kubernetes-service-endpoints
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scrape
      - action: replace
        regex: (https?)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scheme
        target_label: __scheme__
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_path
        target_label: __metrics_path__
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: kubernetes_node
    - job_name: kubernetes-service-endpoints-slow
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
      - action: replace
        regex: (https?)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scheme
        target_label: __scheme__
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_path
        target_label: __metrics_path__
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: kubernetes_node
      scrape_interval: 5m
      scrape_timeout: 30s
    - honor_labels: true
      job_name: prometheus-pushgateway
      kubernetes_sd_configs:
      - role: service
      relabel_configs:
      - action: keep
        regex: pushgateway
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_probe
    - job_name: kubernetes-services
      kubernetes_sd_configs:
      - role: service
      metrics_path: /probe
      params:
        module:
        - http_2xx
      relabel_configs:
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_probe
      - source_labels:
        - __address__
        target_label: __param_target
      - replacement: blackbox
        target_label: __address__
      - source_labels:
        - __param_target
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name
    - job_name: kubernetes-pods
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_path
        target_label: __metrics_path__
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: kubernetes_pod_name
    - job_name: kubernetes-pods-slow
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_path
        target_label: __metrics_path__
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: kubernetes_pod_name
      scrape_interval: 5m
      scrape_timeout: 30s
    
    alerting:
      alertmanagers:
      - kubernetes_sd_configs:
          - role: pod
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace]
          regex: default
          action: keep
        - source_labels: [__meta_kubernetes_pod_label_app]
          regex: prometheus
          action: keep
        - source_labels: [__meta_kubernetes_pod_label_component]
          regex: alertmanager
          action: keep
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_probe]
          regex: .*
          action: keep
        - source_labels: [__meta_kubernetes_pod_container_port_number]
          regex:
          action: drop
  recording_rules.yml: |
    {}
    
  rules: |
    {}

server-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    component: "server"
    app: prometheus
  name: prometheus-server
  namespace: monitoring
spec:
  selector:
    matchLabels:
      component: "server"
      app: prometheus
  replicas: 1
  template:
    metadata:
      labels:
        component: "server"
        app: prometheus
    spec:
      serviceAccountName: prometheus-server
      containers:
        - name: prometheus-server-configmap-reload
          image: "jimmidyson/configmap-reload:v0.4.0"
          imagePullPolicy: "IfNotPresent"
          args:
            - --volume-dir=/etc/config
            - --webhook-url=http://127.0.0.1:9090/-/reload
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
              readOnly: true
        - name: prometheus-server
          image: "prom/prometheus:v2.20.1"
          imagePullPolicy: "IfNotPresent"
          args:
            - --storage.tsdb.retention.time=15d
            - --config.file=/etc/config/prometheus.yml
            - --storage.tsdb.path=/data
            - --web.console.libraries=/etc/prometheus/console_libraries
            - --web.console.templates=/etc/prometheus/consoles
            - --web.enable-lifecycle
          ports:
            - containerPort: 9090
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 5
            timeoutSeconds: 30
            failureThreshold: 3
            successThreshold: 1
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 30
            failureThreshold: 3
            successThreshold: 1
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
            - name: storage-volume
              mountPath: /data
              subPath: ""
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      terminationGracePeriodSeconds: 300
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-server
        - name: storage-volume
          persistentVolumeClaim:
            claimName: prometheus-server

kube-state-metrics-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app.kubernetes.io/name: kube-state-metrics
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  replicas: 1
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
    spec:
      hostNetwork: false
      serviceAccountName: kube-state-metrics
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsUser: 65534
      containers:
      - name: kube-state-metrics
        args:
        - --collectors=certificatesigningrequests
        - --collectors=configmaps
        - --collectors=cronjobs
        - --collectors=daemonsets
        - --collectors=deployments
        - --collectors=endpoints
        - --collectors=horizontalpodautoscalers
        - --collectors=ingresses
        - --collectors=jobs
        - --collectors=limitranges
        - --collectors=mutatingwebhookconfigurations
        - --collectors=namespaces
        - --collectors=networkpolicies
        - --collectors=nodes
        - --collectors=persistentvolumeclaims
        - --collectors=persistentvolumes
        - --collectors=poddisruptionbudgets
        - --collectors=pods
        - --collectors=replicasets
        - --collectors=replicationcontrollers
        - --collectors=resourcequotas
        - --collectors=secrets
        - --collectors=services
        - --collectors=statefulsets
        - --collectors=storageclasses
        - --collectors=validatingwebhookconfigurations
        - --collectors=volumeattachments
        imagePullPolicy: IfNotPresent
        image: "quay.io/coreos/kube-state-metrics:v1.9.7"
        ports:
        - containerPort: 8080
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
        readinessProbe:
          httpGet:
            path: /
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5

alertmanager-cm.yaml

apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
  namespace: monitoring
data:
  alertmanager.yml: |
    global: {}
    receivers:
    - name: default-receiver
    route:
      group_interval: 5m
      group_wait: 10s
      receiver: default-receiver
      repeat_interval: 3h

alertmanager-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    component: "alertmanager"
    app: prometheus
  name: prometheus-alertmanager
  namespace: monitoring
spec:
  selector:
    matchLabels:
      component: "alertmanager"
      app: prometheus
  replicas: 1
  template:
    metadata:
      labels:
        component: "alertmanager"
        app: prometheus
    spec:
      serviceAccountName: prometheus-alertmanager
      containers:
        - name: prometheus-alertmanager
          image: "prom/alertmanager:v0.21.0"
          imagePullPolicy: "IfNotPresent"
          env:
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
          args:
            - --config.file=/etc/config/alertmanager.yml
            - --storage.path=/data
            - --cluster.advertise-address=$(POD_IP):6783
            - --web.external-url=http://localhost:9093
          ports:
            - containerPort: 9093
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 30
            timeoutSeconds: 30
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
            - name: storage-volume
              mountPath: "/data"
              subPath: ""
        - name: prometheus-alertmanager-configmap-reload
          image: "jimmidyson/configmap-reload:v0.4.0"
          imagePullPolicy: "IfNotPresent"
          args:
            - --volume-dir=/etc/config
            - --webhook-url=http://127.0.0.1:9093/-/reload
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
              readOnly: true
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-alertmanager
        - name: storage-volume
          persistentVolumeClaim:
            claimName: prometheus-alertmanager

node-export-ds.yaml

apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    component: "node-exporter"
    app: prometheus
  name: prometheus-node-exporter
  namespace: monitoring
spec:
  selector:
    matchLabels:
      component: "node-exporter"
      app: prometheus
  updateStrategy:
    type: RollingUpdate
    
  template:
    metadata:
      labels:
        component: "node-exporter"
        app: prometheus
    spec:
      serviceAccountName: prometheus-node-exporter
      containers:
        - name: prometheus-node-exporter
          image: "prom/node-exporter:v1.0.1"
          imagePullPolicy: "IfNotPresent"
          args:
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --web.listen-address=:9100
          ports:
            - name: metrics
              containerPort: 9100
              hostPort: 9100
          resources:
            {}
            
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly:  true
            - name: sys
              mountPath: /host/sys
              readOnly: true
      tolerations:
      - effect: NoSchedule
      hostNetwork: true
      hostPID: true
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys

pushgateway-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    component: "pushgateway"
    app: prometheus
  name: prometheus-pushgateway
  namespace: monitoring
spec:
  selector:
    matchLabels:
      component: "pushgateway"
      app: prometheus
  replicas: 1
  template:
    metadata:
      labels:
        component: "pushgateway"
        app: prometheus
    spec:
      serviceAccountName: prometheus-pushgateway
      containers:
        - name: prometheus-pushgateway
          image: "prom/pushgateway:v1.2.0"
          imagePullPolicy: "IfNotPresent"
          ports:
            - containerPort: 9091
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9091
            initialDelaySeconds: 10
            timeoutSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9091
            initialDelaySeconds: 10
            timeoutSeconds: 10
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534

grafana-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  selector:
    matchLabels:
      component: "grafana"
      app: prometheus
  template:
    metadata:
      labels:
        component: "grafana"
        app: prometheus
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:7.1.3
        imagePullPolicy: IfNotPresent
        ports:
        - name: grafana
          containerPort: 3000
        env:
        - name: GF_SECURITY_ADMIN_USER
          value: admin
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: admin321
        readinessProbe:
          failureThreshold: 10
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 60
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          requests:
            cpu: 100m
            memory: 256Mi
        volumeMounts:
        - name: storage
          mountPath: /var/lib/grafana
          subPath: grafana
      securityContext:
        fsGroup: 472
        runAsUser: 472
      volumes:
      - name: storage
        persistentVolumeClaim:
          claimName: prometheus-grafana

以上yaml文件你可能需要做相应的修改,尤其是pv、pvc相关的内容,根据实际情况做修改。

将修改后的yaml文件单独放到一个目录,应用yaml

kubectl apply -f .

查看部署状态:

[root@k8s-master01 prometheus]# kubectl -n monitoring get pod -o wide
NAME                                       READY   STATUS    RESTARTS   AGE   IP              NODE         NOMINATED NODE   READINESS GATES
grafana-7769d5d799-plkl9                   1/1     Running   0          32m   10.244.3.9      k8s-node03   <none>           <none>
kube-state-metrics-74c74464d-xptvl         1/1     Running   0          32m   10.244.2.9      k8s-node02   <none>           <none>
prometheus-alertmanager-67c487c57c-fkn5h   2/2     Running   0          32m   10.244.1.4      k8s-node01   <none>           <none>
prometheus-node-exporter-78c8b             1/1     Running   0          32m   172.16.135.12   k8s-node02   <none>           <none>
prometheus-node-exporter-7j6ht             1/1     Running   0          32m   172.16.135.11   k8s-node01   <none>           <none>
prometheus-node-exporter-znvdj             1/1     Running   0          32m   172.16.135.13   k8s-node03   <none>           <none>
prometheus-pushgateway-794f6bdc9f-wglhm    1/1     Running   0          32m   10.244.3.8      k8s-node03   <none>           <none>
prometheus-server-6fc987c88-2w9h8          2/2     Running   0          32m   10.244.2.10     k8s-node02   <none>           <none>

查看svc,查看访问端口:

[root@k8s-master01 prometheus]# kubectl -n monitoring get svc
NAME                       TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
kube-state-metrics         ClusterIP   10.110.51.133   <none>        8080/TCP         35m
prometheus-alertmanager    ClusterIP   10.102.107.59   <none>        80/TCP           35m
prometheus-grafana         NodePort    10.96.117.212   <none>        3000:31213/TCP   35m
prometheus-node-exporter   ClusterIP   None            <none>        9100/TCP         35m
prometheus-pushgateway     ClusterIP   10.111.57.240   <none>        9091/TCP         35m
prometheus-server          NodePort    10.101.157.77   <none>        80:31004/TCP     35m

可以看到k8s为grafana分配的的NodePort是31213,此时通过http://NodeIP:NodePort即可访问grafana了,我这里是http://172.16.135.10:31213

grafana登录.png

  • 帐号:admin
  • 密码:admin321

帐号密码是在grafana-deploy.yaml文件中定义的

grafana连接prometheus

登录grafana后,点击设置(Configuration) -> 添加数据源(Add data source)

选择Prometheus,URL填入http://prometheus-server,点击Save & Test

点击Dashboard,导入3个默认图表

grafana导入默认图表.png

图表的设计和编码有一定的门槛,grafana社区提供了人们做好的一些图表供我们使用,访问这个链接,即可查看和获取。

这里推荐几个好用的图表:

  • 7249
  • 9797
  • 10000
  • 6588

以上ID,可以直接在grafana的dashboard manager页面导入。

遗留事项

至此,prometheus监控平台搭建完成,但是还远没有结束,至少还有以下问题待解决

  • 如何监控业务
  • 如何告警

标签: none

添加新评论