PNG  IHDR pHYs   OiCCPPhotoshop ICC profilexڝSgTS=BKKoR RB&*! J!QEEȠQ, !{kּ> H3Q5 B.@ $pd!s#~<<+"x M0B\t8K@zB@F&S`cbP-`'{[! eDh;VEX0fK9-0IWfH  0Q){`##xFW<+*x<$9E[-qWW.(I+6aa@.y24x6_-"bbϫp@t~,/;m%h^ uf@Wp~<5j>{-]cK'Xto(hw?G%fIq^D$.Tʳ?D*A, `6B$BB dr`)B(Ͱ*`/@4Qhp.U=pa( Aa!ڈbX#!H$ ɈQ"K5H1RT UH=r9\F;2G1Q= C7F dt1r=6Ыhڏ>C03l0.B8, c˱" VcϱwE 6wB aAHXLXNH $4 7 Q'"K&b21XH,#/{C7$C2'ITFnR#,4H#dk9, +ȅ3![ b@qS(RjJ4e2AURݨT5ZBRQ4u9̓IKhhitݕNWGw Ljg(gwLӋT071oUX**| J&*/Tު UUT^S}FU3S ԖUPSSg;goT?~YYLOCQ_ cx,!k u5&|v*=9C3J3WRf?qtN (~))4L1e\kXHQG6EYAJ'\'GgSSݧ M=:.kDwn^Loy}/TmG X $ <5qo</QC]@Caaᄑ.ȽJtq]zۯ6iܟ4)Y3sCQ? 0k߬~OCOg#/c/Wװwa>>r><72Y_7ȷOo_C#dz%gA[z|!?:eAAA!h쐭!ΑiP~aa~ 'W?pX15wCsDDDޛg1O9-J5*>.j<74?.fYXXIlK9.*6nl {/]py.,:@LN8A*%w% yg"/6шC\*NH*Mz쑼5y$3,幄'L Lݛ:v m2=:1qB!Mggfvˬen/kY- BTZ(*geWf͉9+̳ې7ᒶKW-X潬j9(xoʿܔĹdff-[n ڴ VE/(ۻCɾUUMfeI?m]Nmq#׹=TR+Gw- 6 U#pDy  :v{vg/jBFS[b[O>zG499?rCd&ˮ/~јѡ򗓿m|x31^VwwO| (hSЧc3- cHRMz%u0`:o_F@8N ' p @8N@8}' p '#@8N@8N pQ9p!i~}|6-ӪG` VP.@*j>[ K^<֐Z]@8N'KQ<Q(`s" 'hgpKB`R@Dqj '  'P$a ( `D$Na L?u80e J,K˷NI'0eݷ(NI'؀ 2ipIIKp`:O'`ʤxB8Ѥx Ѥx $ $P6 :vRNb 'p,>NB 'P]-->P T+*^h& p '‰a ‰ (ĵt#u33;Nt̵'ޯ; [3W ~]0KH1q@8]O2]3*̧7# *p>us p _6]/}-4|t'|Smx= DoʾM×M_8!)6lq':l7!|4} '\ne t!=hnLn (~Dn\+‰_4k)0e@OhZ`F `.m1} 'vp{F`ON7Srx 'D˸nV`><;yMx!IS钦OM)Ե٥x 'DSD6bS8!" ODz#R >S8!7ّxEh0m$MIPHi$IvS8IN$I p$O8I,sk&I)$IN$Hi$I^Ah.p$MIN$IR8I·N "IF9Ah0m$MIN$IR8IN$I 3jIU;kO$ɳN$+ q.x* tEXtComment

Viewing File: /opt/cloudlinux/venv/lib/python3.11/site-packages/wmt/common/report.py

#!/opt/cloudlinux/venv/bin/python3

import re
from urllib.parse import urlparse
from dataclasses import dataclass, asdict
from sqlalchemy import func
from sqlalchemy.sql.expression import literal_column
from wmt.common import cfg
from wmt.common.utils import get_domains

from wmt.db import ScrapeResult, session_scope


@dataclass
class SummaryReport:
    """Aggregate scrape counters for one reporting period."""
    count_all: int
    count_successful: int
    count_failed: int
    count_undone: int
    average_time: float

    def to_template(self, *args):
        """Return the counters as a flat row for e-mail templates.

        Any positional arguments (e.g. an output-format selector passed
        uniformly to all report rows) are accepted and ignored.
        """
        # Templates expect the average duration in milliseconds.
        avg_ms = int(self.average_time / 1000)
        return [
            self.count_all,
            self.count_successful,
            self.count_failed,
            self.count_undone,
            avg_ms,
        ]


@dataclass
class ErrorReport:
    """Per-website error summary: concatenated codes and total error count."""
    code: str
    count_errors: int
    url: str

    def to_template(self, alternative):
        """Return [display_url, error_count, codes] as a template row.

        When *alternative* is 'html' the URL is rendered as an anchor tag
        pointing at the original URL; otherwise the bare domain is used.
        """
        display = url_to_domain(self.url)
        if alternative == 'html':
            display = f'<a href="{self.url}">{display}</a>'
        return [display, self.count_errors, self.code]


@dataclass
class DurationReport:
    """Per-website average response time for successful requests."""
    url: str
    average_time: float

    def to_template(self, alternative):
        """Return [display_url, average_ms] as a template row.

        When *alternative* is 'html' the URL is rendered as an anchor tag
        pointing at the original URL; otherwise the bare domain is used.
        """
        display = url_to_domain(self.url)
        if alternative == 'html':
            display = f'<a href="{self.url}">{display}</a>'
        # Templates expect the average duration in milliseconds.
        return [display, int(self.average_time / 1000)]


def url_to_domain(url):
    """Strip a leading ``http://`` or ``https://`` scheme from *url*.

    Returns *url* unchanged when it has no such prefix.
    """
    # Anchored at the start: the previous unanchored pattern removed EVERY
    # occurrence of a scheme, mangling URLs that embed another URL
    # (e.g. 'http://a.com/?next=http://b.com').
    return re.sub(r'^https?://', '', url)


def generate_report(engine, start_date, end_date):
    """Build summary, error and duration reports for scrapes in a date range.

    Queries ScrapeResult rows between start_date and end_date (inclusive),
    filters them to the currently configured (non-ignored) domains, and
    returns a dict with keys 'summary_report' (SummaryReport),
    'error_report' (list[ErrorReport]) and 'duration_report'
    (list[DurationReport]).
    """
    with session_scope(engine) as session:
        # gets counter per status code per website -> group key: website: status_code pair
        # e.g (test.com 404 3), (test.com 500 2)
        subquery = session.query(ScrapeResult.response_code,
                                 ScrapeResult.website,
                                 func.count().label('err_count')).\
            filter(ScrapeResult.create_date >= start_date,
                   ScrapeResult.create_date <= end_date,
                   ScrapeResult.response_code != 200,
                   ScrapeResult.is_finished == True)\
            .group_by(ScrapeResult.response_code, ScrapeResult.website)\
            .subquery()

        # group previous subquery by website
        # code     count  website
        # [('451,500', 3, 'http://www.flightradar24.com'),
        #  ('404', 2, 'http://broken.com')]
        # NOTE(review): group_concat is MySQL/SQLite-specific SQL — confirm
        # the configured engine supports it.
        error_stats = session.query(func.group_concat(subquery.c.response_code),
                                    func.sum(subquery.c.err_count),
                                    subquery.c.website)\
            .group_by(subquery.c.website)\
            .all()
        #  website                          avg ms             count
        # [('http://www.stackoverflow.com', 538.0816599732262, 2241),
        # ('http://www.suser.com',          66.53859883980365, 2241)]
        success_stats = session.query(ScrapeResult.website,
                                      func.avg(ScrapeResult.response_time_ms).label('average_time'),
                                      func.count())\
            .filter(ScrapeResult.create_date >= start_date,
                    ScrapeResult.create_date <= end_date,
                    ScrapeResult.response_code == 200)\
            .group_by(ScrapeResult.website)\
            .order_by(literal_column('average_time').desc()) \
            .all()

        # Scrapes that never finished (is_finished == False) count as "undone".
        count_unsuccessful = session.query(ScrapeResult)\
            .filter(ScrapeResult.create_date >= start_date,
                    ScrapeResult.create_date <= end_date,
                    ScrapeResult.is_finished == False)\
            .count()

    # filter out not present domains using host-only comparison and tolerating www/non-www
    def extract_hostname_only(url_value: str):
        """Return the lowercased hostname of *url_value*, without scheme/slashes."""
        # Scheme-less values parse into .path rather than .netloc, hence the fallback.
        parsed = urlparse(url_value, 'http')
        hostname = parsed.netloc or parsed.path
        return hostname.lower().strip('/')

    # Build the set of hosts we are allowed to report on; include both the
    # www and bare forms of each configured domain so either spelling matches.
    allowed_hosts = set()
    for domain in get_domains():
        hostname = extract_hostname_only(domain)
        allowed_hosts.add(hostname)
        # include both www and non-www forms for matching
        if hostname.startswith('www.'):
            allowed_hosts.add(hostname[4:])
        else:
            allowed_hosts.add('www.' + hostname)

    # Drop rows for domains that are ignored in config or no longer configured.
    success_stats = [
        (url, average_time, count)
        for url, average_time, count in success_stats
        if not cfg.is_domain_ignored(url) and extract_hostname_only(url) in allowed_hosts
    ]

    error_stats = [
        (code, count, url)
        for code, count, url in error_stats
        if not cfg.is_domain_ignored(url) and extract_hostname_only(url) in allowed_hosts
    ]

    error_report = [ErrorReport(code=code, count_errors=count_errors, url=url)
                    for code, count_errors, url in error_stats]

    # Collapse by host: prefer https entry if it exists for the host; otherwise keep the first
    # implemented because in report we do not want to show both http and https entries for the same host
    host_choice = {}
    for url, average_time, _ in success_stats:
        host = extract_hostname_only(url)
        is_https = str(url).lower().startswith('https://')
        chosen = host_choice.get(host)
        if chosen is None:
            host_choice[host] = (url, average_time, is_https)
        else:
            # upgrade to https if available
            if not chosen[2] and is_https:
                host_choice[host] = (url, average_time, True)

    # NOTE(review): times are scaled up by 1000 here and scaled back down by
    # 1000 in the to_template() methods — presumably ms -> us round-trip;
    # confirm before changing either side.
    duration_report = [DurationReport(url=item[0], average_time=int(round(item[1] * 1000)))
                       for item in host_choice.values()]

    successful_requests_count = sum(success_count for url, _, success_count in success_stats)
    error_requests_count = sum(errors_count for _, errors_count, url in error_stats)

    # Overall average across websites (unweighted by request count); 0 when
    # there were no successful requests at all.
    averages = [item[1] for item in success_stats]
    average_count = 0 if not averages else int(round(1000 * sum(averages) / len(averages)))
    summary_report = SummaryReport(count_all=successful_requests_count + error_requests_count + count_unsuccessful,
                                   count_successful=successful_requests_count,
                                   count_failed=error_requests_count,
                                   count_undone=count_unsuccessful,
                                   average_time=average_count)
    return {
        'summary_report': summary_report,
        'error_report': error_report,
        'duration_report': duration_report
    }


def report_dict(report):
    """Convert a report mapping of dataclass instances into plain dicts.

    Expects the keys 'summary_report' (a single dataclass) and
    'error_report' / 'duration_report' (lists of dataclasses).
    """
    summary = asdict(report['summary_report'])
    errors = [asdict(entry) for entry in report['error_report']]
    durations = [asdict(entry) for entry in report['duration_report']]
    return {
        'summary_report': summary,
        'error_report': errors,
        'duration_report': durations,
    }
Back to Directory=ceiIENDB`