User:AnomieBOT/source/tasks/LeftBehindEnDashRedirectNotifier.pm

From Wikipedia, the free encyclopedia

package tasks::LeftBehindEnDashRedirectNotifier;
use parent 'AnomieBOT::Task';

=pod

=begin metadata

Bot:      AnomieBOT
Task:     LeftBehindEnDashRedirectNotifier
BRFA:     N/A
Status:   Begun 2025-09-22
Created:  2025-09-22

List redirects created by EnDashRedirectCreator that seem like they were incorrectly not G8-ed.

=end metadata

=cut

use utf8;
use strict;

use Time::HiRes;
use tasks::EnDashRedirectCreator;

# False positives, mainly due to people doing weird moves that result in the redirect being at a title the bot wouldn't have created.
my %skip = (
    'Kick-Ass: The New Girl' => 1,
    'Talk:Kick-Ass: The New Girl' => 1,
    'Talk:Sunan al-Nasa\'i' => 1,
    'Talk:Al-Shushtari' => 1,
);

sub new {
    my $class = shift;
    my $self = $class->SUPER::new();
    $self->{'order'} = 1000;
    bless $self, $class;
    return $self;
}

=pod

=for info
Per [[WP:BOTUSERSPACE]], any bot or automated editing process that affects only
the operator's or their own userspace, and which are not otherwise disruptive,
may be run without prior approval.

=cut

sub approved {
    return 999;
}

sub run {
    my ($self, $api) = @_;

    $api->task('LeftBehindEnDashRedirectNotifier', 0, 10, qw/d::IWNS d::Talk/);

    my %rns = $api->namespace_reverse_map();

    my ($dbh);
    eval {
        ($dbh) = $api->connectToReplica( 'enwiki' );
    };
    if ( $@ ) {
        $api->warn( "Error connecting to replica: $@\n" );
        return 300;
    }

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    $dbh->do( q{SET NAMES 'utf8'} );

    my $actorIds;
    eval {
        $actorIds = join( ',', @{ $dbh->selectcol_arrayref( "SELECT actor_id FROM actor_user WHERE actor_name = 'AnomieBOT'" ) } );
    };
    if ( $@ ) {
        $api->warn( "Error fetching actor ID from replica: $@\n" );
        return 300;
    }

    my $targetid;
    eval {
        ( $targetid ) = $dbh->selectrow_array( "SELECT lt_id FROM linktarget WHERE lt_namespace=2 AND lt_title = 'AnomieBOT/Auto-G8'" );
    };
    if ( $@ ) {
        $api->warn( "Error fetching linktarget ID from replica: $@\n" );
        return 300;
    }

    my $skipNs = join( ',', @tasks::EnDashRedirectCreator::skipNs );
    my $dashstr = join( '', keys %tasks::EnDashRedirectCreator::dashes );
    my $dashcond = join( ' OR ', map { "p1.page_title LIKE '%$_%'" } keys %tasks::EnDashRedirectCreator::dashes );
    my $dashcond2 = join( ' OR ', map { "p2.page_title LIKE '%$_%'" } keys %tasks::EnDashRedirectCreator::dashes );
    my $revreplcond = join( ' OR ', map { "REPLACE(p1.page_title,'-','$_') = p2.page_title" } keys %tasks::EnDashRedirectCreator::dashes );

    ## First case: Someone removed the Auto-G8 template.
    my $t0 = Time::HiRes::time();
    my @rows;
    eval {
        @rows = @{ $dbh->selectall_arrayref( qq{
            SET STATEMENT max_statement_time=300 FOR
            SELECT p1.page_namespace AS ns, p1.page_title AS title
            FROM logging_userindex
                JOIN comment_logging ON(log_comment_id=comment_id)
                JOIN page AS p1 ON(log_page=page_id)
            WHERE
                -- AnomieBOT page creations for the en-dash task.
                log_actor IN ($actorIds)
                AND log_type='create' AND log_action='create'
                AND comment_text LIKE '%en-dash%'

                -- Ignore pages moved to namespaces we don't handle.
                AND page_namespace NOT IN ($skipNs)

                -- Only check redirects.
                AND page_is_redirect = 1

                -- Ignore pages moved to titles the bot definitely wouldn't work with, i.e. no ASCII dash or has an en-dash.
                AND page_title LIKE '%-%' AND NOT ($dashcond)

                -- Ignore pages with the Auto-G8 template, maybe the auto-g8 is currently active.
                AND NOT EXISTS( SELECT 1 FROM templatelinks WHERE tl_from=log_page AND tl_target_id=$targetid )

                -- Ignore pages where an en-dashed title still exists.
                -- Most pages have only one dash, so we can check for the corresponding title efficiently.
                AND NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($revreplcond) )
                -- The ones with more dashes need a much less efficient query to catch the case where the target has mixed dashes and en-dashes.
                AND (
                    p1.page_title NOT LIKE '%-%-%'
                    OR NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($dashcond2) AND REGEXP_REPLACE( CONVERT(p2.page_title USING utf8mb4), '[$dashstr]', '-' ) = p1.page_title )
                )
        }, { Slice => {} } ) };
    };
    if ( $@ ) {
        $api->warn( "Error fetching first page list from replica: $@\n" );
        return 300;
    }
    my $t1 = Time::HiRes::time();
    $api->log( 'DB query took ' . ($t1-$t0) . ' seconds' );

    my @pages = ();
    for my $row (@rows) {
        utf8::decode( $row->{'title'} ); # Data from database is binary

        my $page = ( $row->{'ns'} ? $rns{$row->{'ns'}} . ':' : '' ) . $row->{'title'};
        $page =~ s/_/ /g;
        push @pages, $page unless exists( $skip{$page} );
    }

    if ( @pages ) {
        for my $page (sort @pages) {
            $api->whine( "Check en-dash redirect [[:$page]]", "It looks like {{-r|1=$page}} was created by the EnDashRedirectCreator task and should probably no longer exist (i.e. no corresponding en-dashed title seems to exist anymore), but instead of being deleted the [[User:AnomieBOT/Auto-G8]] template has gone missing. Please look into this." );
        }
    }

    ## Second case: Someone incorrectly changed the Auto-G8 template.
    $t0 = Time::HiRes::time();
    eval {
        @rows = @{ $dbh->selectall_arrayref( qq{
            SET STATEMENT max_statement_time=300 FOR
            SELECT p1.page_namespace AS ns, p1.page_title AS title
            FROM
                externallinks
                JOIN page AS p1 ON(el_from = page_id)
            WHERE
                -- Template is complaining about a mismatch
                el_to_domain_index = 'urn:.' AND el_to_path = 'x-anomiebot-auto-g8-mismatch:endash'

                -- Ignore pages moved to namespaces we don't handle.
                AND page_namespace NOT IN ($skipNs)

                -- Only check redirects.
                AND page_is_redirect = 1

                -- Ignore pages moved to titles the bot definitely wouldn't work with, i.e. no ASCII dash or has an en-dash.
                AND page_title LIKE '%-%' AND NOT ($dashcond)

                -- Ignore pages where an en-dashed title still exists.
                -- Most pages have only one dash, so we can check for the corresponding title efficiently.
                AND NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($revreplcond) )
                -- The ones with more dashes need a much less efficient query to catch the case where the target has mixed dashes and en-dashes.
                AND (
                    p1.page_title NOT LIKE '%-%-%'
                    OR NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($dashcond2) AND REGEXP_REPLACE( CONVERT(p2.page_title USING utf8mb4), '[$dashstr]', '-' ) = p1.page_title )
                )
        }, { Slice => {} } ) };
    };
    if ( $@ ) {
        $api->warn( "Error fetching second page list from replica: $@\n" );
        return 300;
    }
    $t1 = Time::HiRes::time();
    $api->log( 'DB query took ' . ($t1-$t0) . ' seconds' );

    @pages = ();
    for my $row (@rows) {
        utf8::decode( $row->{'title'} ); # Data from database is binary

        my $page = ( $row->{'ns'} ? $rns{$row->{'ns'}} . ':' : '' ) . $row->{'title'};
        $page =~ s/_/ /g;
        push @pages, $page unless exists( $skip{$page} );
    }

    if ( @pages ) {
        for my $page (sort @pages) {
            $api->whine( "Check en-dash redirect [[:$page]]", "It looks like {{-r|1=$page}} was created or adopted by the EnDashRedirectCreator task and should probably no longer exist (i.e. no corresponding en-dashed title seems to exist anymore), but instead of being deleted the [[User:AnomieBOT/Auto-G8]] template's target has been incorrectly changed. Please look into this." );
        }
    }

    my $t = 86400 - ( time() % 86400 );
    return $t;
}

1;

Related Articles

Wikiwand AI