ok

Mini Shell

Direktori : /opt/imunify360-webshield/lualib/resty/upstream/
Upload File :
Current File : //opt/imunify360-webshield/lualib/resty/upstream/healthcheck.lua

local stream_sock = ngx.socket.tcp
local log = ngx.log
local ERR = ngx.ERR
local WARN = ngx.WARN
local DEBUG = ngx.DEBUG
local ngx = ngx
local error = error
local string = string
local sub = string.sub
local re_find = ngx.re.find
local new_timer = ngx.timer.at
local shared = ngx.shared
local debug_mode = ngx.config.debug
local concat = table.concat
local tonumber = tonumber
local tostring = tostring
local ipairs = ipairs
local ceil = math.ceil
local spawn = ngx.thread.spawn
local wait = ngx.thread.wait
local pcall = pcall
local setmetatable = setmetatable

-- LuaFormatter off
-- Module table returned at the end of the file; _VERSION carries the
-- library version string.
local _M = {
    _VERSION = '0.08'
}

-- Load-time guard: abort with an error unless ngx.config reports an
-- ngx_lua_version of at least 9005 (reported to the user as "0.9.5+").
if not ngx.config
   or not ngx.config.ngx_lua_version
   or ngx.config.ngx_lua_version < 9005
then
    error("ngx_lua 0.9.5+ required")
end
-- LuaFormatter on

-- "ngx.upstream" is a hard dependency: loading is attempted under pcall so
-- that a missing module is reported with a dedicated error message.
local ok, upstream = pcall(require, "ngx.upstream")
if not ok then
    error("ngx_upstream_lua module required")
end

-- "table.new" is optional. When it cannot be loaded (or is not a function),
-- fall back to a shim that ignores the size hints and returns a plain
-- empty table.
local ok, new_tab = pcall(require, "table.new")
if not ok or type(new_tab) ~= "function" then
    new_tab = function(narr, nrec)
        return {}
    end
end

-- Local aliases for the ngx.upstream API functions used in this file.
local set_peer_down = upstream.set_peer_down
local get_primary_peers = upstream.get_primary_peers
local get_backup_peers = upstream.get_backup_peers
local get_upstreams = upstream.get_upstreams

-- Per-upstream counter, keyed by upstream name. It is incremented when a
-- checker is spawned and decremented when a checker fails to re-arm its
-- timer; the status pages treat a missing or zero entry as "no checkers".
local upstream_checker_statuses = {}

-- Emit a WARN-level log entry; every message is tagged with the
-- "healthcheck: " prefix followed by the given arguments.
local function warn(...)
    local level = WARN
    log(level, "healthcheck: ", ...)
end

-- Emit an ERR-level log entry; every message is tagged with the
-- "healthcheck: " prefix followed by the given arguments.
local function errlog(...)
    local level = ERR
    log(level, "healthcheck: ", ...)
end

-- Emit a DEBUG-level log entry tagged with "healthcheck: ".
-- Does nothing unless debug_mode (ngx.config.debug) is truthy.
local function debug(...)
    if not debug_mode then
        return
    end
    log(DEBUG, "healthcheck: ", ...)
end

-- Build the shared-dict key for a single peer.
--   prefix    : key namespace, e.g. "d:", "ok:" or "nok:"
--   u         : upstream name
--   is_backup : truthy for a backup peer (tag ":b"), otherwise primary (":p")
--   id        : peer id, appended after the role tag
-- Returns the string prefix .. u .. (":b" | ":p") .. id.
local function gen_peer_key(prefix, u, is_backup, id)
    local role_tag = ":p"
    if is_backup then
        role_tag = ":b"
    end
    return prefix .. u .. role_tag .. id
end

-- Apply a peer's down state in this worker and record it in the shared
-- dict under the "d:" key for that peer.
--   ctx       : checker context (reads ctx.upstream and ctx.dict, sets
--               ctx.new_version)
--   is_backup : truthy when the peer is a backup peer
--   id        : peer id
--   value     : down state passed to set_peer_down and stored in the dict
-- Failures of either step are logged, not raised.
local function set_peer_down_globally(ctx, is_backup, id, value)
    local upstream_name = ctx.upstream
    local shm = ctx.dict

    local applied, apply_err = set_peer_down(upstream_name, is_backup, id,
                                             value)
    if not applied then
        errlog("failed to set peer down: ", apply_err)
    end

    -- flag the context so that do_check() publishes a new peers version
    if not ctx.new_version then
        ctx.new_version = true
    end

    local state_key = gen_peer_key("d:", upstream_name, is_backup, id)
    local stored, store_err = shm:set(state_key, value)
    if not stored then
        errlog("failed to set peer down state: ", store_err)
    end
end

-- Record one failed check for a peer.
-- Bumps the "nok:" counter in the shared dict; on the first failure the
-- "ok:" counter is reset to 0 if it held a non-zero value. Once the failure
-- count reaches ctx.fall and the peer is not already marked down, the peer
-- is marked down locally and via set_peer_down_globally().
local function peer_fail(ctx, is_backup, id, peer)
    debug("peer ", peer.name, " was checked to be not ok")

    local upstream_name = ctx.upstream
    local shm = ctx.dict

    local nok_key = gen_peer_key("nok:", upstream_name, is_backup, id)
    local fails, get_err = shm:get(nok_key)
    if fails then
        fails = fails + 1
        local bumped, incr_err = shm:incr(nok_key, 1)
        if not bumped then
            errlog("failed to incr peer nok key: ", incr_err)
        end
    else
        if get_err then
            errlog("failed to get peer nok key: ", get_err)
            return
        end
        fails = 1

        -- get-then-set is not atomic across workers; the resulting race
        -- is tolerated here.
        local stored, set_err = shm:set(nok_key, 1)
        if not stored then
            errlog("failed to set peer nok key: ", set_err)
        end
    end

    if fails == 1 then
        -- first failure: clear any success count accumulated so far
        local ok_key = gen_peer_key("ok:", upstream_name, is_backup, id)
        local succ, ok_err = shm:get(ok_key)
        if succ and succ ~= 0 then
            local stored, set_err = shm:set(ok_key, 0)
            if not stored then
                errlog("failed to set peer ok key: ", set_err)
            end
        elseif ok_err then
            errlog("failed to get peer ok key: ", ok_err)
            return
        end
    end

    if fails >= ctx.fall and not peer.down then
        warn("peer ", peer.name, " is turned down after ", fails, " failure(s)")
        peer.down = true
        set_peer_down_globally(ctx, is_backup, id, true)
    end
end

-- Record one successful check for a peer.
-- Bumps the "ok:" counter in the shared dict; on the first success the
-- "nok:" counter is reset to 0 if it held a non-zero value. Once the success
-- count reaches ctx.rise and the peer is currently marked down, the peer is
-- marked up locally and via set_peer_down_globally().
local function peer_ok(ctx, is_backup, id, peer)
    debug("peer ", peer.name, " was checked to be ok")

    local upstream_name = ctx.upstream
    local shm = ctx.dict

    local ok_key = gen_peer_key("ok:", upstream_name, is_backup, id)
    local succ, get_err = shm:get(ok_key)
    if succ then
        succ = succ + 1
        local bumped, incr_err = shm:incr(ok_key, 1)
        if not bumped then
            errlog("failed to incr peer ok key: ", incr_err)
        end
    else
        if get_err then
            errlog("failed to get peer ok key: ", get_err)
            return
        end
        succ = 1

        -- get-then-set is not atomic across workers; the resulting race
        -- is tolerated here.
        local stored, set_err = shm:set(ok_key, 1)
        if not stored then
            errlog("failed to set peer ok key: ", set_err)
        end
    end

    if succ == 1 then
        -- first success: clear any failure count accumulated so far
        local nok_key = gen_peer_key("nok:", upstream_name, is_backup, id)
        local fails, nok_err = shm:get(nok_key)
        if fails and fails ~= 0 then
            local stored, set_err = shm:set(nok_key, 0)
            if not stored then
                errlog("failed to set peer nok key: ", set_err)
            end
        elseif nok_err then
            errlog("failed to get peer nok key: ", nok_err)
            return
        end
    end

    if succ >= ctx.rise and peer.down then
        warn("peer ", peer.name, " is turned up after ", succ, " success(es)")
        peer.down = nil
        set_peer_down_globally(ctx, is_backup, id, nil)
    end
end

-- shortcut error function for check_peer()
-- Helper for check_peer(): log the given message parts at ERR level unless
-- the peer is already marked down, then always count the check as a
-- failure through peer_fail().
local function peer_error(ctx, is_backup, id, peer, ...)
    local already_down = peer.down
    if not already_down then
        errlog(...)
    end
    peer_fail(ctx, is_backup, id, peer)
end

-- Probe a single peer and record the outcome.
--   ctx       : checker context (timeout, http_req, statuses, type, host,
--               ssl_verify, ...)
--   id        : peer id
--   peer      : peer table (name, optional host/port, down)
--   is_backup : truthy for a backup peer
-- Connects, optionally performs an SSL handshake (ctx.type == "https"),
-- sends ctx.http_req and reads one line of the response. When ctx.statuses
-- is set, the status code parsed from that line must be a key of it.
-- Any failure is reported through peer_error(); success through peer_ok().
local function check_peer(ctx, id, peer, is_backup)
    local peer_name = peer.name
    local accepted = ctx.statuses
    local request = ctx.http_req

    local sock, sock_err = stream_sock()
    if not sock then
        errlog("failed to create stream socket: ", sock_err)
        return
    end

    sock:settimeout(ctx.timeout)

    local connected, conn_err
    if peer.host then
        connected, conn_err = sock:connect(peer.host, peer.port)
    else
        connected, conn_err = sock:connect(peer_name)
    end
    if not connected then
        return peer_error(ctx, is_backup, id, peer,
                          "failed to connect to ", peer_name, ": ", conn_err)
    end

    if ctx.type == "https" then
        local shaken, ssl_err = sock:sslhandshake(nil, ctx.host,
                                                  ctx.ssl_verify)
        if not shaken then
            sock:close()
            return peer_error(ctx, is_backup, id, peer,
                              "failed to ssl handshake to ", peer_name, ": ",
                              ssl_err)
        end
    end

    local sent, send_err = sock:send(request)
    if not sent then
        return peer_error(ctx, is_backup, id, peer,
                          "failed to send request to ", peer_name, ": ",
                          send_err)
    end

    local status_line, recv_err = sock:receive()
    if not status_line then
        peer_error(ctx, is_backup, id, peer,
                   "failed to receive status line from ", peer_name, ": ",
                   recv_err)
        if recv_err == "timeout" then
            sock:close() -- timeout errors do not close the socket.
        end
        return
    end

    if accepted then
        local from, to, re_err = re_find(status_line,
                                         [[^HTTP/\d+\.\d+\s+(\d+)]],
                                         "joi", nil, 1)
        if re_err then
            errlog("failed to parse status line: ", re_err)
        end

        if not from then
            peer_error(ctx, is_backup, id, peer, "bad status line from ",
                       peer_name, ": ", status_line)
            sock:close()
            return
        end

        local code = tonumber(sub(status_line, from, to))
        if not accepted[code] then
            peer_error(ctx, is_backup, id, peer, "bad status code from ",
                       peer_name, ": ", code)
            sock:close()
            return
        end
    end

    peer_ok(ctx, is_backup, id, peer)
    sock:close()
end

-- Check the peers at 1-based positions from..to (inclusive), one after
-- another. Peer ids passed to check_peer() are 0-based (position - 1).
local function check_peer_range(ctx, from, to, peers, is_backup)
    local pos = from
    while pos <= to do
        check_peer(ctx, pos - 1, peers[pos], is_backup)
        pos = pos + 1
    end
end

-- Check every peer in `peers`, using up to ctx.concurrency light threads.
--   ctx       : checker context (reads ctx.concurrency)
--   peers     : array of peer tables
--   is_backup : truthy when `peers` are backup peers
-- With concurrency <= 1 the peers are checked sequentially. Otherwise the
-- work is spread over spawned light threads, with the current thread
-- always handling the final peer/group itself; all spawned threads are
-- waited on before returning.
local function check_peers(ctx, peers, is_backup)
    local n = #peers
    if n == 0 then
        return
    end

    local concur = ctx.concurrency
    if concur <= 1 then
        for i = 1, n do
            check_peer(ctx, i - 1, peers[i], is_backup)
        end
    else
        local threads
        local nthr

        if n <= concur then
            -- one peer per thread; n - 1 spawned, the last one runs inline
            nthr = n - 1
            threads = new_tab(nthr, 0)
            for i = 1, nthr do

                if debug_mode then
                    debug("spawn a thread checking ",
                          is_backup and "backup" or "primary", " peer ", i - 1)
                end

                threads[i] = spawn(check_peer, ctx, i - 1, peers[i], is_backup)
            end
            -- use the current "light thread" to run the last task
            if debug_mode then
                debug("check ", is_backup and "backup" or "primary", " peer ",
                      n - 1)
            end
            check_peer(ctx, n - 1, peers[n], is_backup)

        else
            -- more peers than threads: split into contiguous groups of
            -- group_size peers; the last group runs in the current thread
            local group_size = ceil(n / concur)
            nthr = ceil(n / group_size) - 1

            threads = new_tab(nthr, 0)
            local from = 1
            local rest = n
            for i = 1, nthr do
                local to
                if rest >= group_size then
                    rest = rest - group_size
                    to = from + group_size - 1
                else
                    -- partial group: the end index must be derived from the
                    -- remaining count *before* that count is cleared,
                    -- otherwise the range would be empty (to == from - 1).
                    to = from + rest - 1
                    rest = 0
                end

                if debug_mode then
                    debug("spawn a thread checking ",
                          is_backup and "backup" or "primary", " peers ",
                          from - 1, " to ", to - 1)
                end

                threads[i] = spawn(check_peer_range, ctx, from, to, peers,
                                   is_backup)
                from = from + group_size
                if rest == 0 then
                    break
                end
            end
            if rest > 0 then
                local to = from + rest - 1

                if debug_mode then
                    debug("check ", is_backup and "backup" or "primary",
                          " peers ", from - 1, " to ", to - 1)
                end

                check_peer_range(ctx, from, to, peers, is_backup)
            end
        end

        -- join every light thread that was actually spawned
        if nthr and nthr > 0 then
            for i = 1, nthr do
                local t = threads[i]
                if t then
                    wait(t)
                end
            end
        end
    end
end

-- Bring this worker's view of `peers` in line with the shared dict.
-- For each peer, the presence of a value under its "d:" key means "down".
-- When that differs from the cached peer.down, set_peer_down() is called
-- and, on success, the cached flag is updated as well.
local function upgrade_peers_version(ctx, peers, is_backup)
    local shm = ctx.dict
    local upstream_name = ctx.upstream

    for pos = 1, #peers do
        local peer = peers[pos]
        local id = pos - 1

        local state_key = gen_peer_key("d:", upstream_name, is_backup, id)
        local res, get_err = shm:get(state_key)
        if not res and get_err then
            errlog("failed to get peer down state: ", get_err)
        end

        local down = res and true or false
        local cached_down = peer.down and true or false

        if cached_down ~= down then
            local applied, apply_err = set_peer_down(upstream_name, is_backup,
                                                     id, down)
            if applied then
                -- update our cache too
                peer.down = down
            else
                errlog("failed to set peer down: ", apply_err)
            end
        end
    end
end

-- Compare the peers version stored under "v:<upstream>" in the shared dict
-- with ctx.version.
--   * stored version newer  -> re-sync primary and backup peers and adopt it
--   * no stored version but ctx.version > 0 -> request a re-publish by
--     setting ctx.new_version
--   * dict read error -> logged, nothing else happens
local function check_peers_updates(ctx)
    local shm = ctx.dict
    local upstream_name = ctx.upstream
    local ver_key = "v:" .. upstream_name

    local ver, get_err = shm:get(ver_key)
    if ver then
        if ctx.version < ver then
            debug("upgrading peers version to ", ver)
            upgrade_peers_version(ctx, ctx.primary_peers, false)
            upgrade_peers_version(ctx, ctx.backup_peers, true)
            ctx.version = ver
        end
        return
    end

    if get_err then
        errlog("failed to get peers version: ", get_err)
        return
    end

    if ctx.version > 0 then
        ctx.new_version = true
    end
end

-- Try to take the per-upstream check lock ("l:<upstream>") in the shared
-- dict. Returns true when this caller obtained it, nil otherwise.
local function get_lock(ctx)
    local shm = ctx.dict
    local lock_key = "l:" .. ctx.upstream

    -- The lock lives for (almost) the whole interval so that several worker
    -- processes do not send the test request at the same time. Its expiry
    -- is 1ms shorter than the interval to avoid racing with the next timer
    -- event.
    local added, add_err = shm:add(lock_key, true, ctx.interval - 0.001)
    if added then
        return true
    end

    -- "exists" just means somebody else holds the lock; anything else is
    -- worth logging.
    if add_err ~= "exists" then
        errlog("failed to add key \"", lock_key, "\": ", add_err)
    end
    return nil
end

-- Run one healthcheck cycle for the upstream described by `ctx`.
--   1. pick up peer state changes published through the shared dict;
--   2. if this caller obtains the per-interval lock, probe all primary and
--      backup peers;
--   3. if any peer state changed (ctx.new_version), publish a new peers
--      version under "v:<upstream>".
local function do_check(ctx)
    debug("healthcheck: run a check cycle")

    check_peers_updates(ctx)

    if get_lock(ctx) then
        check_peers(ctx, ctx.primary_peers, false)
        check_peers(ctx, ctx.backup_peers, true)
    end

    if not ctx.new_version then
        return
    end

    local key = "v:" .. ctx.upstream
    local dict = ctx.dict

    if debug_mode then
        debug("publishing peers version ", ctx.version + 1)
    end

    -- make sure the counter exists before incrementing it; an "exists"
    -- failure from add() is expected and ignored
    dict:add(key, 0)
    local new_ver, err = dict:incr(key, 1)
    if not new_ver then
        errlog("failed to publish new peers version: ", err)
        -- Keep ctx.version as it is: overwriting it with nil would make the
        -- numeric comparisons on ctx.version throw in every later cycle.
        -- ctx.new_version stays set so the publish is retried next cycle.
        return
    end

    ctx.version = new_ver
    ctx.new_version = nil
end

-- Adjust the checker counter of an upstream in upstream_checker_statuses:
-- +1 when `success` is truthy, -1 otherwise. A missing entry counts as 0.
local function update_upstream_checker_status(upstream, success)
    local delta = success and 1 or -1
    local current = upstream_checker_statuses[upstream] or 0
    upstream_checker_statuses[upstream] = current + delta
end

-- Timer callback: run one check cycle and re-arm the timer.
--   premature : truthy when the callback is invoked prematurely; the
--               callback then does nothing
--   ctx       : checker context
-- Errors raised by do_check() are caught and logged. If the next timer
-- cannot be created, the upstream's checker counter is decremented.
local function check(premature, ctx)
    if premature then
        return
    end

    local ran, run_err = pcall(do_check, ctx)
    if not ran then
        errlog("failed to run healthcheck cycle: ", run_err)
    end

    local armed, timer_err = new_timer(ctx.interval, check, ctx)
    if armed then
        return
    end

    if timer_err ~= "process exiting" then
        errlog("failed to create timer: ", timer_err)
    end

    update_upstream_checker_status(ctx.upstream, false)
end

-- Annotate peers whose name looks like "<host>:<digits>" with `host` and
-- `port` fields. When `port` is given it overrides the port parsed from the
-- name. Peers without a name, or with a name not matching that shape, are
-- left untouched. Returns the same `peers` table.
local function preprocess_peers(peers, port)
    for pos = 1, #peers do
        local entry = peers[pos]
        local name = entry.name

        if name then
            -- from/to delimit the first capture, i.e. the host part
            local from, to = re_find(name, [[^(.*):\d+$]], "jo", nil, 1)
            if from then
                entry.host = sub(name, 1, to)
                entry.port = port or tonumber(sub(name, to + 2))
            end
        end
    end
    return peers
end

-- Start a healthchecker for one upstream.
-- Options read from `opts`:
--   type           : "http" or "https" (required)
--   http_req       : raw request sent to each peer (required)
--   shm            : name of the shared dict to use (required)
--   upstream       : upstream name (required)
--   ssl_verify     : passed to the SSL handshake; defaults to true when nil
--   timeout        : socket timeout, default 1000
--   interval       : check interval in milliseconds, default 1 second,
--                    clamped to at least 2ms
--   valid_statuses : array of acceptable status codes (optional)
--   concurrency    : default 1
--   fall / rise    : thresholds, defaults 5 / 2
--   host, port     : optional overrides stored in the context / peers
--   no_timer       : in debug mode, run a single check inline instead of
--                    scheduling a timer
-- Returns true on success, or nil plus an error message.
function _M.spawn_checker(opts)
    local typ = opts.type
    if not typ then
        return nil, "\"type\" option required"
    end

    if not (typ == "http" or typ == "https") then
        return nil, "only \"http\" and \"https\" type are supported right now"
    end

    -- an explicit false must be honoured, so only nil gets the default
    local ssl_verify = opts.ssl_verify
    if ssl_verify == nil then
        ssl_verify = true
    end

    local http_req = opts.http_req
    if not http_req then
        return nil, "\"http_req\" option required"
    end

    local timeout = opts.timeout or 1000

    local interval = opts.interval
    if interval then
        -- milliseconds -> seconds
        interval = interval / 1000
        if interval < 0.002 then -- minimum 2ms
            interval = 0.002
        end
    else
        interval = 1
    end

    -- turn the array of valid statuses into a lookup set
    local statuses
    local valid_statuses = opts.valid_statuses
    if valid_statuses then
        statuses = new_tab(0, #valid_statuses)
        for _, code in ipairs(valid_statuses) do
            statuses[code] = true
        end
    end

    local concur = opts.concurrency or 1
    local fall = opts.fall or 5
    local rise = opts.rise or 2

    local shm = opts.shm
    if not shm then
        return nil, "\"shm\" option required"
    end

    local dict = shared[shm]
    if not dict then
        return nil, "shm \"" .. tostring(shm) .. "\" not found"
    end

    local u = opts.upstream
    if not u then
        return nil, "no upstream specified"
    end

    local ppeers, perr = get_primary_peers(u)
    if not ppeers then
        return nil, "failed to get primary peers: " .. perr
    end

    local bpeers, berr = get_backup_peers(u)
    if not bpeers then
        return nil, "failed to get backup peers: " .. berr
    end

    local ctx = {
        upstream = u,
        primary_peers = preprocess_peers(ppeers, opts.port),
        backup_peers = preprocess_peers(bpeers, opts.port),
        http_req = http_req,
        timeout = timeout,
        interval = interval,
        dict = dict,
        fall = fall,
        rise = rise,
        statuses = statuses,
        version = 0,
        concurrency = concur,
        type = typ,
        host = opts.host,
        ssl_verify = ssl_verify
    }

    if debug_mode and opts.no_timer then
        -- run the first cycle synchronously
        check(nil, ctx)
    else
        local armed, timer_err = new_timer(0, check, ctx)
        if not armed then
            return nil, "failed to create timer: " .. timer_err
        end
    end

    update_upstream_checker_status(u, true)

    return true
end

-- Metatable for the append-only string buffers used by the status pages.
-- A buffer is a table { statuses = <array>, idx = <next free slot> }.
local new_status_meta = {}
new_status_meta.__index = new_status_meta

-- Store `rhs` in the next free slot and advance the slot index.
-- Returns nothing.
function new_status_meta.__add(self, rhs)
    local slot = self.idx
    self.statuses[slot] = rhs
    self.idx = slot + 1
end

-- Method form of the append operation.
function new_status_meta:add(rhs)
    self:__add(rhs)
end

-- Create an empty status buffer for `n` upstreams. The backing array is
-- allocated through new_tab with a size hint of n * 90 array slots.
local function new_status_table(n)
    local buf = new_tab(n * 90, 0)
    return setmetatable({statuses = buf, idx = 1}, new_status_meta)
end

-- combined upstream status adding functions

-- Append one upstream-level metric line to `tab`:
--   nginx_upstream_status_info{name="<u>",status="<st>\n
-- `st` is expected to supply the closing quote, brace and sample value
-- (e.g. 'UP"} 1').
local function add_upstream_prometheus_status_line(tab, u, st)
    local append = tab.add
    append(tab, 'nginx_upstream_status_info{name="')
    append(tab, u)
    append(tab, '",status="')
    append(tab, st)
    append(tab, '\n')
end

-- Append the three upstream-level lines describing an upstream that is UP
-- (UP = 1, DOWN = 0, UNKNOWN = 0).
local function add_upstream_up_prometheus_status(tab, u)
    local emit = add_upstream_prometheus_status_line
    emit(tab, u, 'UP"} 1')
    emit(tab, u, 'DOWN"} 0')
    emit(tab, u, 'UNKNOWN"} 0')
end

-- Append the three upstream-level lines describing an upstream that is DOWN
-- (UP = 0, DOWN = 1, UNKNOWN = 0).
local function add_upstream_down_prometheus_status(tab, u)
    local emit = add_upstream_prometheus_status_line
    emit(tab, u, 'UP"} 0')
    emit(tab, u, 'DOWN"} 1')
    emit(tab, u, 'UNKNOWN"} 0')
end

-- Append the three upstream-level lines describing an upstream whose state
-- is UNKNOWN (UP = 0, DOWN = 0, UNKNOWN = 1).
local function add_upstream_unknown_prometheus_status(tab, u)
    local emit = add_upstream_prometheus_status_line
    emit(tab, u, 'UP"} 0')
    emit(tab, u, 'DOWN"} 0')
    emit(tab, u, 'UNKNOWN"} 1')
end

-- peer status generator functions

-- Append one peer-level metric line to `tab`:
--   nginx_upstream_status_info{name="<u>",endpoint="<p>",status="<s>",role="<r>"} <n>
--   u : upstream name      p : peer endpoint name
--   r : role label         s : status label      n : sample value
local function gen_peer_prometheus_status(tab, u, p, r, s, n)
    local append = tab.add
    append(tab, "nginx_upstream_status_info{name=\"")
    append(tab, u)
    append(tab, "\",endpoint=\"")
    append(tab, p)
    append(tab, "\",status=\"")
    append(tab, s)
    append(tab, "\",role=\"")
    append(tab, r)
    append(tab, "\"} ")
    append(tab, n)
    append(tab, "\n")
end

-- combined peer status adding function

-- Append the UP and DOWN metric lines for peer `p` of upstream `u` with
-- role `r`; exactly one of the two carries the value 1, depending on
-- whether p.down is truthy.
local function add_peer_status(tab, u, p, r)
    local up_value = p.down and 0 or 1
    gen_peer_prometheus_status(tab, u, p.name, r, "UP", up_value)

    local down_value = p.down and 1 or 0
    gen_peer_prometheus_status(tab, u, p.name, r, "DOWN", down_value)
end

-- Append the UP and DOWN metric lines for peer `p` of upstream `u` with
-- role `r`; exactly one of the two carries the value 1, depending on
-- whether p.down is truthy.
local function add_peer_prometheus_status(tab, u, p, r)
    local up_value = p.down and 0 or 1
    gen_peer_prometheus_status(tab, u, p.name, r, "UP", up_value)

    local down_value = p.down and 1 or 0
    gen_peer_prometheus_status(tab, u, p.name, r, "DOWN", down_value)
end

-- Append one indented "<name> UP" / "<name> DOWN" line per peer to `tab`.
-- The `u` and `role` arguments are accepted but not used.
local function add_peers_info(tab, u, peers, role)
    for pos = 1, #peers do
        local entry = peers[pos]
        tab:add("        ")
        tab:add(entry.name)
        tab:add(entry.down and " DOWN\n" or " UP\n")
    end
end

-- Append the metric lines for every peer in `peers` and report whether at
-- least one of them is not marked down.
-- Returns true if some peer has a falsy `down` field, false otherwise
-- (including for an empty list).
local function add_peers_prometheus_info(tab, u, peers, role)
    local any_up = false
    for pos = 1, #peers do
        local entry = peers[pos]
        add_peer_prometheus_status(tab, u, entry, role)
        any_up = any_up or not entry.down
    end
    return any_up
end

-- Render the state of all upstreams in Prometheus text format.
--
-- Output shape:
--   # HELP nginx_upstream_status_info The running status of nginx upstream
--   # TYPE nginx_upstream_status_info gauge
--   nginx_upstream_status_info{name="",endpoint="",status="",role=""} num
--
-- Per upstream:
--   * no checker registered       -> UNKNOWN
--   * primary peers not available -> DOWN
--   * otherwise per-peer lines for primary and (if available) backup peers,
--     followed by UP if any peer is up, DOWN if none is.
--
-- Returns the metrics text, or nil plus an error message when the list of
-- upstreams cannot be obtained.
function _M.prometheus_status_page()
    local us, err = get_upstreams()
    if not us then
        return nil, "failed to get upstream names: " .. err
    end

    local n = #us

    local stats_tab = new_status_table(n)

    stats_tab:add(
        "# HELP nginx_upstream_status_info The running status of nginx upstream\n")
    stats_tab:add("# TYPE nginx_upstream_status_info gauge\n")

    for i = 1, n do
        local u = us[i]
        local ncheckers = upstream_checker_statuses[u]

        if not ncheckers or ncheckers == 0 then
            add_upstream_unknown_prometheus_status(stats_tab, u)
        else
            -- fetch the primary peers once and reuse the result
            local primary = get_primary_peers(u)
            if not primary then
                add_upstream_down_prometheus_status(stats_tab, u)
            else
                local found_up_peer = add_peers_prometheus_info(
                    stats_tab, u, primary, "PRIMARY")

                -- backup peers are optional; a lookup failure is not fatal
                local backup = get_backup_peers(u)
                if backup then
                    if add_peers_prometheus_info(stats_tab, u, backup,
                                                 "BACKUP")
                    then
                        found_up_peer = true
                    end
                end

                if found_up_peer then
                    add_upstream_up_prometheus_status(stats_tab, u)
                else
                    add_upstream_down_prometheus_status(stats_tab, u)
                end
            end
        end
    end

    return concat(stats_tab.statuses)
end

-- Render a plain-text overview of all upstreams and their peers.
-- For each upstream: a header line (with " (NO checkers)" appended when no
-- checker is registered), then the primary and backup peers with their
-- UP/DOWN state. Upstream sections are separated by a blank line.
-- Always returns a single string; when a lookup fails, that string is the
-- error message instead of the page.
function _M.status_page()
    local names, names_err = get_upstreams()
    if not names then
        return "failed to get upstream names: " .. names_err
    end

    local total = #names
    local page = new_status_table(total)

    for pos = 1, total do
        local u = names[pos]

        if pos > 1 then
            page:add("\n")
        end

        page:add("Upstream ")
        page:add(u)

        local ncheckers = upstream_checker_statuses[u]
        local has_checkers = ncheckers and ncheckers ~= 0
        if not has_checkers then
            page:add(" (NO checkers)")
        end

        page:add("\n    Primary Peers\n")

        local primary, primary_err = get_primary_peers(u)
        if not primary then
            return "failed to get primary peers in upstream " .. u .. ": "
                   .. primary_err
        end
        add_peers_info(page, u, primary, "PRIMARY")

        page:add("    Backup Peers\n")

        local backup, backup_err = get_backup_peers(u)
        if not backup then
            return "failed to get backup peers in upstream " .. u .. ": "
                   .. backup_err
        end
        add_peers_info(page, u, backup, "BACKUP")
    end

    return concat(page.statuses)
end

return _M

Zerion Mini Shell 1.0