Merge branch 'main' of ssh://192.168.3.189:2222/admin/nexus
This commit is contained in:
448
openclaw/knowledgebase/monitor-stack-deployment.md
Normal file
448
openclaw/knowledgebase/monitor-stack-deployment.md
Normal file
@@ -0,0 +1,448 @@
|
||||
# Ubuntu2 监控栈部署笔记 (Telegraf + InfluxDB + Grafana)
|
||||
|
||||
> 部署时间:2026-03-28
|
||||
> 目的:收集服务器性能指标并通过 Grafana 可视化历史数据
|
||||
|
||||
---
|
||||
|
||||
## 一、架构概述
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ Ubuntu2 │
|
||||
│ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ Telegraf │───►│ InfluxDB │───►│ Grafana │ │
|
||||
│ │ (采集器) │ │ (时序数据库) │ │ (可视化) │ │
|
||||
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ 系统指标:CPU、内存、 http://192.168.3.45:3000 │
|
||||
│ 磁盘、网络、负载等 │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 二、部署环境
|
||||
|
||||
| 项目 | 信息 |
|
||||
|------|------|
|
||||
| 服务器 | Ubuntu2 (192.168.3.45) |
|
||||
| Docker | 已安装 |
|
||||
| 部署目录 | `~/docker/monitor-stack/` |
|
||||
| 访问地址 | Grafana: http://192.168.3.45:3000 |
|
||||
|
||||
---
|
||||
|
||||
## 三、为什么不使用 Glances 原生 Export
|
||||
|
||||
### 尝试过的方案
|
||||
|
||||
1. **Glances + InfluxDB2 Export**
|
||||
- 问题:Glances latest-full 镜像有兼容性问题
|
||||
- 错误:`influxdb2_host` 参数不被识别
|
||||
- 原因:Glances Docker 镜像的 export 模块加载问题
|
||||
|
||||
2. **结论**
|
||||
- Glances 官方 Docker 镜像与 InfluxDB2 export 存在兼容性问题
|
||||
- 改用 Telegraf 作为替代方案,更加稳定可靠
|
||||
|
||||
---
|
||||
|
||||
## 四、最终方案:Telegraf + InfluxDB + Grafana
|
||||
|
||||
### 4.1 目录结构
|
||||
|
||||
```
|
||||
~/docker/monitor-stack/
|
||||
├── docker-compose.yml # 容器编排配置
|
||||
├── telegraf.conf # Telegraf 采集配置
|
||||
├── glances.conf # Glances 配置(保留,未使用)
|
||||
└── grafana/
|
||||
├── provisioning/
|
||||
│ ├── datasources/
|
||||
│ │ └── influxdb.yml # Grafana 数据源
|
||||
│ └── dashboards/
|
||||
│ └── dashboards.yml # Dashboard provisioning
|
||||
└── dashboards/
|
||||
└── telegraf-system.json # 自定义 Dashboard
|
||||
```
|
||||
|
||||
### 4.2 docker-compose.yml
|
||||
|
||||
```yaml
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
# InfluxDB 时序数据库
|
||||
influxdb:
|
||||
image: influxdb:2.7-alpine
|
||||
container_name: influxdb
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8086:8086"
|
||||
environment:
|
||||
- DOCKER_INFLUXDB_INIT_MODE=setup
|
||||
- DOCKER_INFLUXDB_INIT_USERNAME=admin
|
||||
- DOCKER_INFLUXDB_INIT_PASSWORD=admin123
|
||||
- DOCKER_INFLUXDB_INIT_ORG=home-lab
|
||||
- DOCKER_INFLUXDB_INIT_BUCKET=server-metrics
|
||||
- DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=my-super-secret-admin-token
|
||||
volumes:
|
||||
- influxdb-data:/var/lib/influxdb2
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8086/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- monitor-network
|
||||
|
||||
# Telegraf 采集器
|
||||
telegraf:
|
||||
image: telegraf:1.31-alpine
|
||||
container_name: telegraf
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
influxdb:
|
||||
condition: service_started
|
||||
environment:
|
||||
- HOST_PROC=/host/proc
|
||||
- HOST_SYS=/host/sys
|
||||
- HOST_ETC=/host/etc
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /etc:/host/etc:ro
|
||||
- ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
|
||||
networks:
|
||||
- monitor-network
|
||||
command: telegraf --config /etc/telegraf/telegraf.conf
|
||||
|
||||
# Grafana 可视化仪表板
|
||||
grafana:
|
||||
image: grafana/grafana:11.3.0-ubuntu
|
||||
container_name: grafana
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin123
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
depends_on:
|
||||
- influxdb
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
networks:
|
||||
- monitor-network
|
||||
|
||||
volumes:
|
||||
influxdb-data:
|
||||
driver: local
|
||||
grafana-data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
monitor-network:
|
||||
name: monitor-network
|
||||
driver: bridge
|
||||
```
|
||||
|
||||
### 4.3 telegraf.conf
|
||||
|
||||
```toml
|
||||
# Telegraf 采集配置
|
||||
|
||||
[agent]
|
||||
interval = "10s"
|
||||
round_interval = true
|
||||
metric_batch_size = 1000
|
||||
metric_buffer_limit = 10000
|
||||
flush_interval = "10s"
|
||||
debug = false
|
||||
quiet = false
|
||||
|
||||
# CPU 指标
|
||||
[[inputs.cpu]]
|
||||
percpu = true
|
||||
totalcpu = true
|
||||
|
||||
# 内存指标
|
||||
[[inputs.mem]]
|
||||
|
||||
# 磁盘使用
|
||||
[[inputs.disk]]
|
||||
|
||||
# 磁盘 I/O
|
||||
[[inputs.diskio]]
|
||||
|
||||
# 系统指标
|
||||
[[inputs.system]]
|
||||
|
||||
# 网络接口
|
||||
[[inputs.net]]
|
||||
|
||||
# 输出到 InfluxDB v2
|
||||
[[outputs.influxdb_v2]]
|
||||
urls = ["http://influxdb:8086"]
|
||||
token = "my-super-secret-admin-token"
|
||||
organization = "home-lab"
|
||||
bucket = "server-metrics"
|
||||
```
|
||||
|
||||
### 4.4 Grafana 数据源配置 (grafana/provisioning/datasources/influxdb.yml)
|
||||
|
||||
```yaml
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: InfluxDB
|
||||
type: influxdb
|
||||
access: proxy
|
||||
url: http://influxdb:8086
|
||||
jsonData:
|
||||
version: Flux
|
||||
organization: home-lab
|
||||
defaultBucket: server-metrics
|
||||
tlsSkipVerify: true
|
||||
secureJsonData:
|
||||
token: my-super-secret-admin-token
|
||||
isDefault: true
|
||||
editable: false
|
||||
```
|
||||
|
||||
### 4.5 Dashboard Provisioning (grafana/provisioning/dashboards/dashboards.yml)
|
||||
|
||||
```yaml
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Server Metrics'
|
||||
orgId: 1
|
||||
folder: 'Server Metrics'
|
||||
folderUid: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 30
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 五、部署命令
|
||||
|
||||
### 5.1 启动监控栈
|
||||
|
||||
```bash
|
||||
cd ~/docker/monitor-stack
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### 5.2 检查服务状态
|
||||
|
||||
```bash
|
||||
docker compose ps
|
||||
```
|
||||
|
||||
### 5.3 查看日志
|
||||
|
||||
```bash
|
||||
# 查看所有服务日志
|
||||
docker compose logs
|
||||
|
||||
# 查看特定服务日志
|
||||
docker compose logs telegraf
|
||||
docker compose logs influxdb
|
||||
docker compose logs grafana
|
||||
```
|
||||
|
||||
### 5.4 重启服务
|
||||
|
||||
```bash
|
||||
docker compose restart <服务名>
|
||||
```
|
||||
|
||||
### 5.5 停止服务
|
||||
|
||||
```bash
|
||||
docker compose down
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 六、访问信息
|
||||
|
||||
| 服务 | 地址 | 用户名 | 密码 |
|
||||
|------|------|--------|------|
|
||||
| Grafana | http://192.168.3.45:3000 | admin | admin123 |
|
||||
| InfluxDB | http://192.168.3.45:8086 | admin | admin123 |
|
||||
|
||||
---
|
||||
|
||||
## 七、Dashboard 介绍
|
||||
|
||||
### 7.1 Ubuntu2 系统监控
|
||||
|
||||
包含以下 Panel:
|
||||
|
||||
| Panel | 类型 | 说明 |
|
||||
|-------|------|------|
|
||||
| CPU 使用率 | Gauge | 实时 CPU 使用百分比 |
|
||||
| 内存使用率 | Gauge | 实时内存使用百分比 |
|
||||
| CPU 历史趋势 | Line Chart | 过去 1 小时 CPU 趋势 |
|
||||
| 内存历史趋势 | Line Chart | 过去 1 小时内存趋势 |
|
||||
| 网络流量 | Line Chart | 网卡接收/发送速率 |
|
||||
| 系统负载 (1分钟) | Line Chart | 1分钟平均负载 |
|
||||
| 磁盘 Inodes | Bar Gauge | Inodes 使用百分比 |
|
||||
| 系统负载趋势 | Line Chart | 1/5/15分钟负载对比 |
|
||||
|
||||
### 7.2 数据刷新
|
||||
|
||||
- Telegraf 采集间隔:10 秒
|
||||
- Grafana 刷新间隔:10 秒
|
||||
- 数据保留:未设置 `DOCKER_INFLUXDB_INIT_RETENTION` 时,初始 bucket 默认永久保留(建议后续设置保留策略)
|
||||
|
||||
---
|
||||
|
||||
## 八、InfluxDB 信息
|
||||
|
||||
| 项目 | 值 |
|
||||
|------|-----|
|
||||
| 组织 | home-lab |
|
||||
| Bucket | server-metrics |
|
||||
| Token | my-super-secret-admin-token |
|
||||
|
||||
### 8.1 查询数据
|
||||
|
||||
使用 Flux 查询语言:
|
||||
|
||||
```bash
|
||||
# 查询 CPU 使用率
|
||||
curl -s -H "Authorization: Token my-super-secret-admin-token" \
|
||||
-H "Content-Type: application/vnd.flux" \
|
||||
"http://localhost:8086/api/v2/query?org=home-lab" \
|
||||
-d 'from(bucket:"server-metrics") |> range(start:-1h) |> filter(fn: (r) => r._measurement == "cpu")'
|
||||
```
|
||||
|
||||
### 8.2 已采集的 Measurements
|
||||
|
||||
- `cpu` - CPU 指标
|
||||
- `mem` - 内存指标
|
||||
- `disk` - 磁盘使用
|
||||
- `diskio` - 磁盘 I/O
|
||||
- `net` - 网络接口
|
||||
- `system` - 系统负载
|
||||
|
||||
---
|
||||
|
||||
## 九、扩展到其他服务器
|
||||
|
||||
### 9.1 Mac Mini
|
||||
|
||||
```bash
|
||||
# 安装 Telegraf
|
||||
brew install telegraf
|
||||
|
||||
# 创建 telegraf.conf(参考 Ubuntu2 配置)
|
||||
# 修改 [[outputs.influxdb_v2]] 中的 urls 为 ["http://192.168.3.45:8086"]
|
||||
|
||||
# 启动 Telegraf
|
||||
telegraf --config telegraf.conf
|
||||
```
|
||||
|
||||
### 9.2 Ubuntu1
|
||||
|
||||
```bash
|
||||
# 安装 Telegraf
|
||||
sudo apt update
|
||||
sudo apt install telegraf
|
||||
|
||||
# 修改 /etc/telegraf/telegraf.conf
|
||||
# 修改 [[outputs.influxdb_v2]] 中的 urls 为 ["http://192.168.3.45:8086"]
|
||||
|
||||
# 启动 Telegraf
|
||||
sudo systemctl enable telegraf
|
||||
sudo systemctl start telegraf
|
||||
```
|
||||
|
||||
### 9.3 多主机数据区分
|
||||
|
||||
在每台主机的 telegraf.conf 的 `[agent]` 段中设置 `hostname`(Telegraf 会将其作为 `host` tag 的值写入每条指标,用于区分主机):
|
||||
|
||||
```toml
|
||||
[agent]
|
||||
interval = "10s"
|
||||
hostname = "macmini" # 添加这行区分主机
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 十、故障排除
|
||||
|
||||
### 10.1 Telegraf 无法连接 InfluxDB
|
||||
|
||||
检查:
|
||||
1. Docker 网络是否正确配置
|
||||
2. InfluxDB 是否正常运行:`docker compose ps`
|
||||
3. 查看 Telegraf 日志:`docker compose logs telegraf`
|
||||
|
||||
### 10.2 Grafana 显示 "No Data"
|
||||
|
||||
检查:
|
||||
1. 数据源是否正确配置
|
||||
2. Telegraf 是否正在采集数据
|
||||
3. InfluxDB 是否有数据:使用「8.1 查询数据」中的 `curl` 命令查询验证
|
||||
|
||||
### 10.3 容器无法启动
|
||||
|
||||
```bash
|
||||
# 查看详细错误
|
||||
docker compose up
|
||||
|
||||
# 重建容器
|
||||
docker compose down
|
||||
docker compose up -d --force-recreate
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 十一、后续优化建议
|
||||
|
||||
1. **设置数据保留策略**
|
||||
- 建议保留 30 天数据,避免磁盘空间耗尽
|
||||
|
||||
2. **添加告警规则**
|
||||
- CPU > 90% 持续 5 分钟
|
||||
- 内存 > 85% 持续 5 分钟
|
||||
|
||||
3. **扩展 Dashboard**
|
||||
- 按需添加更多 Panel
|
||||
- 添加 Docker 容器监控
|
||||
|
||||
4. **定时备份**
|
||||
- 备份 InfluxDB 数据
|
||||
- 备份 Grafana Dashboard JSON
|
||||
|
||||
---
|
||||
|
||||
## 十二、相关文件路径
|
||||
|
||||
- 部署目录:`/home/shenwei/docker/monitor-stack/`
|
||||
- Grafana 数据卷:`grafana-data`
|
||||
- InfluxDB 数据卷:`influxdb-data`
|
||||
- 配置文件:`telegraf.conf`
|
||||
|
||||
---
|
||||
|
||||
*最后更新:2026-03-28*
|
||||
@@ -2,9 +2,11 @@
|
||||
|
||||
## 备份状态表格
|
||||
|
||||
| 日期 | 时间 | 服务器 | 备份文件 | 状态 |
|
||||
| ---------- | ----- | -------- | ----------------------------------- | ---- |
|
||||
| 2026-03-27 | 22:02 | Mac Mini | openclaw-macmini-20260327220222.tar | ✅ 成功 |
|
||||
| 日期 | 时间 | 服务器 | 备份文件 | 状态 |
|
||||
| ---------- | ----- | -------- | ------------------------------------ | ---- |
|
||||
| 2026-03-28 | 22:02 | Mac Mini | openclaw-macmini-20260328220157.tar | ✅ 成功 |
|
||||
| 2026-03-28 | 22:02 | Ubuntu2 | openclaw-ubuntu2-20260328220208.tar | ✅ 成功 |
|
||||
| 2026-03-27 | 22:02 | Mac Mini | openclaw-macmini-20260327220222.tar | ✅ 成功 |
|
||||
| 2026-03-27 | 22:02 | Ubuntu2 | openclaw-ubuntu2-20260327220222.tar | ✅ 成功 |
|
||||
| 2026-03-26 | 22:02 | Mac Mini | openclaw-macmini-20260326220236.tar | ✅ 成功 |
|
||||
| 2026-03-26 | 22:02 | Ubuntu2 | openclaw-ubuntu2-20260326220236.tar | ✅ 成功 |
|
||||
|
||||
Reference in New Issue
Block a user