-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathotnode_backup.sh
More file actions
146 lines (116 loc) · 6.27 KB
/
otnode_backup.sh
File metadata and controls
146 lines (116 loc) · 6.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash
# This script backs up otnode (blazegraph.jnl and operationaldb) from one VPS to another using ZFS snapshots, sends Telegram messages, and validates the jnl file at the destination.
# It expects SRC_POOL and DST_POOL to be created in advance,
# and blazegraph.jnl to be placed in SRC_POOL.
# Blame ChatGPT if it destroys your node.
# Before you move the jnl file to the pool, make sure you set a smaller record size (16K or 8K) for your pool, like this: zfs set recordsize=16K blzpool. The default of 128K does not work well for the Blazegraph workload.
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Mirror everything the script prints (stdout and stderr) into the log file
# while still showing it on the terminal.
exec > >(tee -a /root/zfs_backup.log) 2>&1

# Settings. Each value may be supplied through the environment so that the
# bot token and host names do not have to be written into this file; when a
# variable is unset or empty, the placeholder below is used and must be
# replaced with a real value before the script can work.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-YOUR_TG_BOT_TOKEN}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-YOUR_CHAT_ID}"
SRC_POOL="${SRC_POOL:-LOCAL_POOL_NAME}"
DST_POOL="${DST_POOL:-REMOTE_POOL_NAME}"
DST_HOST="${DST_HOST:-YOUR_REMOTE_HOST_DOMAIN_NAME_OR_IP}"

# Snapshot name for this run, e.g. bk_2025-05-15_23-45-33.
SNAP_NAME="bk_$(date +%F_%H-%M-%S)"
#######################################
# Print a message and forward it to the Telegram chat.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (sent with parse_mode=Markdown)
# Outputs:   the message on stdout; a warning on stderr if delivery fails
# Returns:   always 0 - a notification problem must never abort the backup
#            (under `set -e` a failing curl would otherwise stop the script,
#            possibly while the services are shut down).
#######################################
notify_telegram() {
  local msg="$1"
  printf '%s\n' "$msg"
  # --data-urlencode keeps characters such as '&', '+', '=' and '%' intact;
  # plain -d would send them raw and corrupt or truncate the text.
  if ! curl -s --fail -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -d chat_id="${TELEGRAM_CHAT_ID}" \
    -d parse_mode="Markdown" \
    --data-urlencode "text=${msg}" >/dev/null; then
    printf 'warning: could not deliver Telegram notification\n' >&2
  fi
  return 0
}
# 1 while otnode/blazegraph are deliberately stopped by this script, so the
# error handler knows it has to bring them back.
services_stopped=0

# ERR handler: if the failure happened while the services were stopped,
# restart them (best effort) so a broken backup never leaves the node down,
# then report the failure.
on_backup_error() {
  if (( services_stopped )); then
    services_stopped=0
    echo "backup failed while services were stopped; restarting blazegraph and otnode..."
    systemctl start blazegraph || true
    sleep 5
    systemctl start otnode || true
  fi
  notify_telegram "❌ *ZFS backup FAILED* on $(hostname) at $(date). Check /root/zfs_backup.log"
}
trap on_backup_error ERR

notify_telegram "🚀 *ZFS backup started* on $(hostname) at $(date)"

notify_telegram "stopping otnode..."
services_stopped=1
systemctl stop otnode
sleep 5

notify_telegram "starting SQL dump..."
# NOTE(review): the database password is passed on the command line, where it
# is visible in the process list - consider a MySQL client option file.
mysqldump -u root -padmin operationaldb > "/${SRC_POOL}/operationaldb_backup.sql"

#capture number of triples in the blazegraph at the source
mkdir -p /root/blaze_count
count_file="/root/blaze_count/${SNAP_NAME}.json"
curl --fail -s -X POST http://localhost:9999/blazegraph/namespace/dkg/sparql \
  -H "Content-Type: application/sparql-query" \
  -H "Accept: application/sparql-results+json" \
  --data 'SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }' > "$count_file"

# Extract triple count from JSON ("unknown" when no numeric value is found)
triples=$(grep -oP '"value"\s*:\s*"\K[0-9]+' "$count_file" || echo "unknown")
if [[ "$triples" =~ ^[0-9]+$ ]]; then
  notify_telegram "Blazegraph snapshot *${SNAP_NAME}*. Triple count at the source: \`${triples}\`"
else
  # Not fatal at the source: report it and dump the raw response into the log.
  notify_telegram "❌ Could not get triple count at the source for *${SNAP_NAME}*"
  cat "$count_file"
fi

# Blazegraph is stopped before the snapshot is taken.
systemctl stop blazegraph
sleep 5

notify_telegram "creating snapshot..."
zfs snapshot "${SRC_POOL}@${SNAP_NAME}"
notify_telegram "snapshot created"

notify_telegram "starting blazegraph and otnode..."
# Remember when the restart began so that only journal entries written after
# it are inspected; an older "up and running" line must not count.
restart_time=$(date '+%Y-%m-%d %H:%M:%S')
systemctl start blazegraph
sleep 5
systemctl start otnode
services_stopped=0

# Wait up to 60 seconds for otnode to report that it is up.
if ! timeout 60 bash -c 'until journalctl -u otnode.service --since "$1" | grep -q "Node is up and running!"; do sleep 1; done' _ "$restart_time"; then
  notify_telegram "❌ OT-Node failed to start in time"
  exit 1
else
  notify_telegram "*OT-Node is up and running*"
fi
# Get latest local snapshot (the list is sorted by creation time, oldest first)
latest_snap=$(zfs list -t snapshot -o name -s creation -H | grep "^${SRC_POOL}@" | tail -n1)
# Get latest snapshot on remote; empty when the destination has none yet
remote_snap=$(ssh "root@${DST_HOST}" "zfs list -t snapshot -o name -s creation -H | grep '^${DST_POOL}@' | tail -n1")

# Extract snapshot names (e.g., bk_2025-05-15_23-45-33) by stripping "<pool>@".
# latest_name is not used further in this section.
latest_name=${latest_snap#"${SRC_POOL}@"}
remote_name=${remote_snap#"${DST_POOL}@"}

echo -e "Source:\t\t $latest_snap"
echo "Destination has: $remote_snap"

# Roll the destination back to its latest snapshot before receiving: starting
# Blazegraph there for validation changes the data. Skipped when the
# destination has no snapshot yet - "zfs rollback <pool>@" would fail and
# abort the script before the initial full send could run.
if [[ -n "$remote_name" ]]; then
  ssh "root@${DST_HOST}" "zfs rollback ${DST_POOL}@${remote_name}"
fi

notify_telegram "sending snapshot..."
# Send snapshot
if [[ -z "$remote_name" ]]; then
  echo "No matching snapshot on remote. Sending full snapshot..."
  zfs send -c "${latest_snap}" | ssh "root@${DST_HOST}" "zfs receive -F ${DST_POOL}"
else
  echo "Sending incremental snapshot from ${remote_name}..."
  zfs send -c -i "${SRC_POOL}@${remote_name}" "${latest_snap}" | ssh "root@${DST_HOST}" "zfs receive ${DST_POOL}"
fi
#validate jnl on the remote host
notify_telegram "validating snapshot at the destination..."
# The script below runs on the destination via "bash -s". The here-document
# delimiter is quoted, so nothing in it is expanded locally; the values it
# needs are handed over as environment assignments on the remote command line.
ssh "root@${DST_HOST}" DST_POOL="$DST_POOL" SNAP_NAME="$SNAP_NAME" TELEGRAM_BOT_TOKEN="$TELEGRAM_BOT_TOKEN" TELEGRAM_CHAT_ID="$TELEGRAM_CHAT_ID" 'bash -s' <<'EOF'
set -euo pipefail

# Send one Markdown message to Telegram. Best effort: a delivery problem is
# logged but does not change the validation result. --data-urlencode keeps
# characters such as '&' and '=' in the text intact.
tg_send() {
  curl -s --fail -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    -d chat_id="${TELEGRAM_CHAT_ID}" \
    -d parse_mode="Markdown" \
    --data-urlencode "text=$1" >/dev/null \
    || echo "warning: could not deliver Telegram notification" >&2
}

stop_blazegraph() {
  echo "Stopping Blazegraph..."
  systemctl stop blazegraph.service
}
# Make sure Blazegraph does not stay running on the destination when the
# validation fails or the script aborts.
trap 'stop_blazegraph || true' EXIT

echo "Starting Blazegraph..."
systemctl start blazegraph.service
sleep 5

echo "Running validation query..."
mkdir -p /root/blaze_count
count_file="/root/blaze_count/${SNAP_NAME}.json"
# A failed query must not abort here: fall through so the failure branch
# below can report it.
curl --fail -s -X POST http://localhost:9999/blazegraph/namespace/dkg/sparql \
  -H "Content-Type: application/sparql-query" \
  -H "Accept: application/sparql-results+json" \
  --data 'SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }' > "$count_file" \
  || echo "validation query failed (curl exit code $?)"

# Extract triple count from JSON ("unknown" when no numeric value is found)
triples=$(grep -oP '"value"\s*:\s*"\K[0-9]+' "$count_file" || echo "unknown")
echo "$triples"

# Report and stop
if [[ "$triples" =~ ^[0-9]+$ ]]; then
  echo "✅ Blazegraph snapshot is healthy: ${triples} triples"
  tg_send "✅ Blazegraph snapshot *${SNAP_NAME}* verified. Triple count: \`${triples}\`"
else
  tg_send "❌ Could not get triple count at the destination for *${SNAP_NAME}*"
  cat "$count_file"
  # Extract first 4000 characters (safe margin) and escape backticks
  payload=$(head -c 4000 "$count_file" | sed 's/`/\\`/g')
  # printf turns \n into real newlines; inside a double-quoted string they
  # would reach Telegram as a literal backslash followed by "n".
  printf -v report '❌ *Blazegraph validation failed* for *%s*.\n\n```\n%s\n```' "$SNAP_NAME" "$payload"
  tg_send "$report"
  exit 1
fi

# Success path: stop Blazegraph explicitly so a failure to stop still fails
# the validation.
trap - EXIT
stop_blazegraph
EOF
# Retention: the snapshot list is sorted by creation time (oldest first), so
# "head -n -14" emits everything except the 14 newest entries; each of those
# older snapshots is destroyed one at a time. xargs -r does nothing when the
# list is empty.
zfs list -t snapshot -o name -s creation -H \
  | grep "^${SRC_POOL}@" \
  | head -n -14 \
  | xargs -r -n1 zfs destroy

# Same retention rule for the destination pool, executed on the remote host.
remote_prune="zfs list -t snapshot -o name -s creation -H | grep '^${DST_POOL}@' | head -n -14 | xargs -r -n1 zfs destroy"
ssh "root@${DST_HOST}" "${remote_prune}"

# Final success report.
notify_telegram "✅ *ZFS backup completed successfully* on $(hostname) at $(date)"