简介
本文档介绍在用户网络中注意到的节点导出器磁盘已满问题。
背景
当对Cluster Manager Common Execution Environment(CEE)执行审核时,审核结果表明节点导出器磁盘已满。
问题
存在严重级别(critical)的警报情况,因为预计在未来24小时内会出现磁盘已满情况,CEE上出现了以下警报:
"节点导出器cee03/node-exporter-4dd4a4dd4a的设备/dev/sda3预计在未来24小时内已满"
分析
报告的警报来自CEE,CEE负责跟踪机架的硬件问题,并预测未来24小时内将出现磁盘已满情况。
cisco@deployer-cm-primary:~$ kubectl get pods -A -o wide | grep node
cee03 node-exporter-4dd4a4dd4a 1/1 Running 1 111d 10.10.1.1 deployer-cm-primary <none> <none>
root@deployer-cm-primary:/# df -h
Filesystem Size Used Avail Use% Mounted on
overlay 568G 171G 368G 32% /
tmpfs 64M 0 64M 0% /dev
tmpfs 189G 0 189G 0% /sys/fs/cgroup
tmpfs 189G 0 189G 0% /host/sys/fs/cgroup
/dev/sda1 9.8G 3.5G 5.9G 37% /host/root
udev 189G 0 189G 0% /host/root/dev
tmpfs 189G 0 189G 0% /host/root/dev/shm
tmpfs 38G 15M 38G 1% /host/root/run
tmpfs 5.0M 0 5.0M 0% /host/root/run/lock
/dev/sda3 71G 67G 435M 100% /host/root/var/log
执行审核时,它似乎会填满/dev/sda3磁盘。
root@deployer-cm-primary:/host/root/var/log# du -h --max-depth=1
76M ./sysstat
16K ./lost+found
4.0K ./containers
4.0K ./landscape
9.3M ./calico
1.1G ./apiserver
808K ./pods
5.6G ./journal
60G ./audit
36K ./apt
67G .
对auditd配置的检查显示,max_log_file_action设置为keep_logs,即审核日志会被一直保留而不会轮换覆盖,因此可能会出现节点导出器磁盘已满的严重情况。
cisco@deployer-cm-primary:~$ sudo cat /etc/audit/auditd.conf
#
# This file controls the configuration of the audit daemon
#
local_events = yes
write_logs = yes
log_file = /var/log/audit/audit.log
log_group = adm
log_format = RAW
flush = INCREMENTAL_ASYNC
freq = 50
max_log_file = 8
num_logs = 5
priority_boost = 4
disp_qos = lossy
dispatcher = /sbin/audispd
name_format = NONE
##name = mydomain
max_log_file_action = keep_logs
space_left = 75
space_left_action = email
verify_email = yes
action_mail_acct = root
admin_space_left = 50
admin_space_left_action = halt
disk_full_action = SUSPEND
disk_error_action = SUSPEND
use_libwrap = yes
##tcp_listen_port = 60
tcp_listen_queue = 5
tcp_max_per_addr = 1
##tcp_client_ports = 1024-65535
tcp_client_max_idle = 0
enable_krb5 = no
krb5_principal = auditd
##krb5_key_file = /etc/audit/audit.key
distribute_network = no
cisco@deployer-cm-primary:~$
解决方案
在Deployer-cm-primary和Deployer-cm-secondary上执行下面列出的命令代码,以修复潜在的节点导出器磁盘满情况。
sudo vim /etc/audit/auditd.conf
然后,在该文件中将max_log_file_action的值从keep_logs更改为rotate,如下所示。
max_log_file_action = rotate
更改代码后,重新启动服务。
sudo systemctl restart auditd.service
验证严重警报是否已清除。