Update nexus: fix conflicts and sync local changes
This commit is contained in:
@@ -1,72 +1,72 @@
|
||||
# Monitoring-Database_686083870
|
||||
## Introduction
|
||||
|
||||
This page presents all the monitors used for database-level monitoring, including but not limited to DB instance-level and application-level monitoring.
|
||||
|
||||
## Detailed monitors
|
||||
|
||||
#### Instance level monitoring
|
||||
|
||||
- Locks - blocked sessions(TBD) - check every 1 minute
|
||||
- Graph (To be reviewed)
|
||||
```
|
||||
-- Counts ungranted lock requests that are paired with another backend's
-- pg_locks entry on the same transactionid, restricted to waiting sessions
-- whose current query started at least 5 minutes ago.
-- One row is counted per (waiting lock, other backend) pair.
SELECT
    count(blocked_lock.pid) AS blocked_pid_count
FROM pg_catalog.pg_locks AS blocked_lock
INNER JOIN pg_catalog.pg_stat_activity AS blocked_session
    ON blocked_session.pid = blocked_lock.pid
INNER JOIN pg_catalog.pg_locks AS blocking_lock
    ON blocking_lock.transactionid = blocked_lock.transactionid
    AND blocking_lock.pid != blocked_lock.pid
INNER JOIN pg_catalog.pg_stat_activity AS blocking_session
    ON blocking_session.pid = blocking_lock.pid
WHERE NOT blocked_lock.granted
    AND blocked_session.query_start <= (now() - interval '5 minutes');
|
||||
```
|
||||
- Table
|
||||
```
|
||||
-- Detail view of the blocked-session check: for each ungranted lock request
-- paired with another backend's pg_locks entry on the same transactionid,
-- shows the waiting session and the other session side by side.
-- Only waiting sessions whose query started at least 5 minutes ago are listed.
SELECT
    blocked_lock.pid AS blocked_pid,
    blocked_session.usename AS blocked_user,
    blocked_session.query_start AS blocked_start_time,
    blocked_session.query AS blocked_statement,
    now() - blocked_session.query_start AS blocked_duration,
    blocking_session.query AS blocking_statement,
    blocking_session.query_start AS blocking_start_time,
    now() - blocking_session.query_start AS blocking_duration,
    blocking_lock.pid AS blocking_pid,
    blocking_session.usename AS blocking_user
FROM pg_catalog.pg_locks AS blocked_lock
INNER JOIN pg_catalog.pg_stat_activity AS blocked_session
    ON blocked_session.pid = blocked_lock.pid
INNER JOIN pg_catalog.pg_locks AS blocking_lock
    ON blocking_lock.transactionid = blocked_lock.transactionid
    AND blocking_lock.pid != blocked_lock.pid
INNER JOIN pg_catalog.pg_stat_activity AS blocking_session
    ON blocking_session.pid = blocking_lock.pid
WHERE NOT blocked_lock.granted
    AND blocked_session.query_start <= (now() - interval '5 minutes');
|
||||
```
|
||||
- Long active queries (TBD) - check every 10 minutes
|
||||
- Graph
|
||||
```
|
||||
-- Number of sessions in state 'active' whose current query started
-- more than 30 minutes ago.
SELECT
    count(activity.pid) AS Long_Query_count
FROM pg_catalog.pg_stat_activity AS activity
WHERE activity.state = 'active'
    AND (now() - activity.query_start) > interval '30 minutes';
|
||||
```
|
||||
- Table
|
||||
```
|
||||
-- Lists each session in state 'active' whose current query started more than
-- 30 minutes ago, together with how long that query has been running.
SELECT
    activity.pid,
    now() - activity.query_start AS duration,
    activity.state
FROM pg_catalog.pg_stat_activity AS activity
WHERE activity.state = 'active'
    AND (now() - activity.query_start) > interval '30 minutes';
|
||||
```
|
||||
- Capture RDS top 10 queries (TBD) - check every 10 minutes
|
||||
1. Reset pg\_stat\_statements bi-weekly
|
||||
2. capture during runtime if CPU is more than 97% for 60 mins
|
||||
3. Table
|
||||
```
|
||||
-- The 20 statements with the highest total time recorded in pg_stat_statements,
-- with call count, mean time, and each statement's share of the overall total.
-- NOTE(review): the section heading says "top 10" but this returns 20 rows - confirm which is intended.
-- NOTE(review): total_time / mean_time were renamed total_exec_time / mean_exec_time
-- in PostgreSQL 13 - confirm the target server version.
SELECT
    (SELECT d.datname FROM pg_database AS d WHERE d.oid = s.dbid) AS db_name,
    s.query AS short_query,
    round(s.total_time::numeric, 2) AS total_time,
    s.calls,
    round(s.mean_time::numeric, 2) AS mean,
    round((100 * s.total_time / sum(s.total_time::numeric) OVER ())::numeric, 2) AS percentage_overall
FROM pg_stat_statements AS s
-- Bare name on purpose: it resolves to the rounded output column above.
ORDER BY total_time DESC
LIMIT 20;
|
||||
```
|
||||
4. Graph (Optional)
|
||||
```
|
||||
-- Graph variant: for the 20 statements with the highest total time in
-- pg_stat_statements, returns only each statement's share of the overall total.
-- NOTE(review): the section heading says "top 10" but this returns 20 rows - confirm which is intended.
-- NOTE(review): total_time was renamed total_exec_time in PostgreSQL 13 - confirm the target server version.
SELECT
    (SELECT d.datname FROM pg_database AS d WHERE d.oid = s.dbid) AS db_name,
    s.query AS short_query,
    round((100 * s.total_time / sum(s.total_time::numeric) OVER ())::numeric, 2) AS percentage_overall
FROM pg_stat_statements AS s
ORDER BY s.total_time DESC
LIMIT 20;
|
||||
```
|
||||
- Dead tuple - logical database level (TBD) - check every 10 minutes
|
||||
- Table
|
||||
```
|
||||
-- Top bloating tables
-- The 20 tables with the most dead tuples, with the times of their last
-- manual vacuum and last autovacuum.
SELECT
    t.relname,
    t.n_dead_tup,
    t.last_vacuum,
    t.last_autovacuum
FROM pg_catalog.pg_stat_all_tables AS t
WHERE t.n_dead_tup > 0
ORDER BY t.n_dead_tup DESC
LIMIT 20;
|
||||
```
|
||||
- Graph (Optional)
|
||||
```
|
||||
-- Top bloating tables
-- Graph variant: the 20 tables with the most dead tuples (name and count only).
-- Fix: the FROM keyword was missing before the table name, which made the
-- statement a syntax error.
SELECT
    t.relname,
    t.n_dead_tup
FROM pg_catalog.pg_stat_all_tables AS t
WHERE t.n_dead_tup > 0
ORDER BY t.n_dead_tup DESC
LIMIT 20;
|
||||
```
|
||||
|
||||
#### Database level customer metrics
|
||||
|
||||
1. NativeSACM Transaction Context Queue - check every 10 minutes
|
||||
```
|
||||
-- Number of rows in the tenant's transaction context table whose entity JSON
-- has one of the listed entity_type values.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    count(*)
FROM transaction_context_$tenant
WHERE entity::jsonb ->> 'entity_type' IN ('Device', 'SystemElement', 'ServiceComponent', 'ActualService');
|
||||
```
|
||||
2. NativeSACM Transaction Context Queue retries - check every 10 minutes
|
||||
```
|
||||
-- Number of rows in the tenant's transaction context table with retry_count = 2
-- whose entity JSON has one of the listed entity_type values.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    count(*)
FROM transaction_context_$tenant
WHERE retry_count = 2
    AND entity::jsonb ->> 'entity_type' IN ('Device', 'SystemElement', 'ServiceComponent', 'ActualService');
|
||||
```
|
||||
3. SLT Job queue - check every 10 minutes
|
||||
```
|
||||
-- Number of transaction context rows that have the lowest bit of flag0 set and
-- a transaction_timestamp later than the 'SLT' job's last_taken value.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    count(1)
FROM transaction_context_$tenant AS ctx
INNER JOIN transaction_etl_job_$tenant AS job
    ON ctx.transaction_timestamp > job.last_taken
WHERE (ctx.flag0 & 1) = 1
    AND job.job_name = 'SLT';
|
||||
```
|
||||
4. SLT Delay time (This may need to be removed if it's the same as SLT Job Queue) - check every 10 minutes
|
||||
```
|
||||
-- Time elapsed since the 'SLT' job's last_taken timestamp.
-- Fix: removed a stray double quote after the terminating semicolon; when
-- pasted, it opened an unterminated quoted identifier.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    now() - last_taken AS delay
FROM maas_admin.transaction_etl_job_$tenant
WHERE job_name = 'SLT';
|
||||
```
|
||||
# Monitoring-Database_686083870
|
||||
## Introduction
|
||||
|
||||
This page presents all the monitors used for database-level monitoring, including but not limited to DB instance-level and application-level monitoring.
|
||||
|
||||
## Detailed monitors
|
||||
|
||||
#### Instance level monitoring
|
||||
|
||||
- Locks - blocked sessions(TBD) - check every 1 minute
|
||||
- Graph (To be reviewed)
|
||||
```
|
||||
-- Counts ungranted lock requests that are paired with another backend's
-- pg_locks entry on the same transactionid, restricted to waiting sessions
-- whose current query started at least 5 minutes ago.
-- One row is counted per (waiting lock, other backend) pair.
SELECT
    count(blocked_lock.pid) AS blocked_pid_count
FROM pg_catalog.pg_locks AS blocked_lock
INNER JOIN pg_catalog.pg_stat_activity AS blocked_session
    ON blocked_session.pid = blocked_lock.pid
INNER JOIN pg_catalog.pg_locks AS blocking_lock
    ON blocking_lock.transactionid = blocked_lock.transactionid
    AND blocking_lock.pid != blocked_lock.pid
INNER JOIN pg_catalog.pg_stat_activity AS blocking_session
    ON blocking_session.pid = blocking_lock.pid
WHERE NOT blocked_lock.granted
    AND blocked_session.query_start <= (now() - interval '5 minutes');
|
||||
```
|
||||
- Table
|
||||
```
|
||||
-- Detail view of the blocked-session check: for each ungranted lock request
-- paired with another backend's pg_locks entry on the same transactionid,
-- shows the waiting session and the other session side by side.
-- Only waiting sessions whose query started at least 5 minutes ago are listed.
SELECT
    blocked_lock.pid AS blocked_pid,
    blocked_session.usename AS blocked_user,
    blocked_session.query_start AS blocked_start_time,
    blocked_session.query AS blocked_statement,
    now() - blocked_session.query_start AS blocked_duration,
    blocking_session.query AS blocking_statement,
    blocking_session.query_start AS blocking_start_time,
    now() - blocking_session.query_start AS blocking_duration,
    blocking_lock.pid AS blocking_pid,
    blocking_session.usename AS blocking_user
FROM pg_catalog.pg_locks AS blocked_lock
INNER JOIN pg_catalog.pg_stat_activity AS blocked_session
    ON blocked_session.pid = blocked_lock.pid
INNER JOIN pg_catalog.pg_locks AS blocking_lock
    ON blocking_lock.transactionid = blocked_lock.transactionid
    AND blocking_lock.pid != blocked_lock.pid
INNER JOIN pg_catalog.pg_stat_activity AS blocking_session
    ON blocking_session.pid = blocking_lock.pid
WHERE NOT blocked_lock.granted
    AND blocked_session.query_start <= (now() - interval '5 minutes');
|
||||
```
|
||||
- Long active queries (TBD) - check every 10 minutes
|
||||
- Graph
|
||||
```
|
||||
-- Number of sessions in state 'active' whose current query started
-- more than 30 minutes ago.
SELECT
    count(activity.pid) AS Long_Query_count
FROM pg_catalog.pg_stat_activity AS activity
WHERE activity.state = 'active'
    AND (now() - activity.query_start) > interval '30 minutes';
|
||||
```
|
||||
- Table
|
||||
```
|
||||
-- Lists each session in state 'active' whose current query started more than
-- 30 minutes ago, together with how long that query has been running.
SELECT
    activity.pid,
    now() - activity.query_start AS duration,
    activity.state
FROM pg_catalog.pg_stat_activity AS activity
WHERE activity.state = 'active'
    AND (now() - activity.query_start) > interval '30 minutes';
|
||||
```
|
||||
- Capture RDS top 10 queries (TBD) - check every 10 minutes
|
||||
1. Reset pg\_stat\_statements bi-weekly
|
||||
2. capture during runtime if CPU is more than 97% for 60 mins
|
||||
3. Table
|
||||
```
|
||||
-- The 20 statements with the highest total time recorded in pg_stat_statements,
-- with call count, mean time, and each statement's share of the overall total.
-- NOTE(review): the section heading says "top 10" but this returns 20 rows - confirm which is intended.
-- NOTE(review): total_time / mean_time were renamed total_exec_time / mean_exec_time
-- in PostgreSQL 13 - confirm the target server version.
SELECT
    (SELECT d.datname FROM pg_database AS d WHERE d.oid = s.dbid) AS db_name,
    s.query AS short_query,
    round(s.total_time::numeric, 2) AS total_time,
    s.calls,
    round(s.mean_time::numeric, 2) AS mean,
    round((100 * s.total_time / sum(s.total_time::numeric) OVER ())::numeric, 2) AS percentage_overall
FROM pg_stat_statements AS s
-- Bare name on purpose: it resolves to the rounded output column above.
ORDER BY total_time DESC
LIMIT 20;
|
||||
```
|
||||
4. Graph (Optional)
|
||||
```
|
||||
-- Graph variant: for the 20 statements with the highest total time in
-- pg_stat_statements, returns only each statement's share of the overall total.
-- NOTE(review): the section heading says "top 10" but this returns 20 rows - confirm which is intended.
-- NOTE(review): total_time was renamed total_exec_time in PostgreSQL 13 - confirm the target server version.
SELECT
    (SELECT d.datname FROM pg_database AS d WHERE d.oid = s.dbid) AS db_name,
    s.query AS short_query,
    round((100 * s.total_time / sum(s.total_time::numeric) OVER ())::numeric, 2) AS percentage_overall
FROM pg_stat_statements AS s
ORDER BY s.total_time DESC
LIMIT 20;
|
||||
```
|
||||
- Dead tuple - logical database level (TBD) - check every 10 minutes
|
||||
- Table
|
||||
```
|
||||
-- Top bloating tables
-- The 20 tables with the most dead tuples, with the times of their last
-- manual vacuum and last autovacuum.
SELECT
    t.relname,
    t.n_dead_tup,
    t.last_vacuum,
    t.last_autovacuum
FROM pg_catalog.pg_stat_all_tables AS t
WHERE t.n_dead_tup > 0
ORDER BY t.n_dead_tup DESC
LIMIT 20;
|
||||
```
|
||||
- Graph (Optional)
|
||||
```
|
||||
-- Top bloating tables
-- Graph variant: the 20 tables with the most dead tuples (name and count only).
-- Fix: the FROM keyword was missing before the table name, which made the
-- statement a syntax error.
SELECT
    t.relname,
    t.n_dead_tup
FROM pg_catalog.pg_stat_all_tables AS t
WHERE t.n_dead_tup > 0
ORDER BY t.n_dead_tup DESC
LIMIT 20;
|
||||
```
|
||||
|
||||
#### Database level customer metrics
|
||||
|
||||
1. NativeSACM Transaction Context Queue - check every 10 minutes
|
||||
```
|
||||
-- Number of rows in the tenant's transaction context table whose entity JSON
-- has one of the listed entity_type values.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    count(*)
FROM transaction_context_$tenant
WHERE entity::jsonb ->> 'entity_type' IN ('Device', 'SystemElement', 'ServiceComponent', 'ActualService');
|
||||
```
|
||||
2. NativeSACM Transaction Context Queue retries - check every 10 minutes
|
||||
```
|
||||
-- Number of rows in the tenant's transaction context table with retry_count = 2
-- whose entity JSON has one of the listed entity_type values.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    count(*)
FROM transaction_context_$tenant
WHERE retry_count = 2
    AND entity::jsonb ->> 'entity_type' IN ('Device', 'SystemElement', 'ServiceComponent', 'ActualService');
|
||||
```
|
||||
3. SLT Job queue - check every 10 minutes
|
||||
```
|
||||
-- Number of transaction context rows that have the lowest bit of flag0 set and
-- a transaction_timestamp later than the 'SLT' job's last_taken value.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    count(1)
FROM transaction_context_$tenant AS ctx
INNER JOIN transaction_etl_job_$tenant AS job
    ON ctx.transaction_timestamp > job.last_taken
WHERE (ctx.flag0 & 1) = 1
    AND job.job_name = 'SLT';
|
||||
```
|
||||
4. SLT Delay time (This may need to be removed if it's the same as SLT Job Queue) - check every 10 minutes
|
||||
```
|
||||
-- Time elapsed since the 'SLT' job's last_taken timestamp.
-- Fix: removed a stray double quote after the terminating semicolon; when
-- pasted, it opened an unterminated quoted identifier.
-- NOTE(review): $tenant is presumably a placeholder substituted by the monitoring tool - confirm.
SELECT
    now() - last_taken AS delay
FROM maas_admin.transaction_etl_job_$tenant
WHERE job_name = 'SLT';
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user