User:AnomieBOT/source/tasks/LeftBehindEnDashRedirectNotifier.pm
From Wikipedia, the free encyclopedia
| Per WP:BOTUSERSPACE, any bot or automated editing process that affects only the operator's or their own userspace, and which are not otherwise disruptive, may be run without prior approval. |
package tasks::LeftBehindEnDashRedirectNotifier;
use parent 'AnomieBOT::Task';
=pod
=begin metadata
Bot: AnomieBOT
Task: LeftBehindEnDashRedirectNotifier
BRFA: N/A
Status: Begun 2025-09-22
Created: 2025-09-22
List redirects created by EnDashRedirectCreator that seem like they were incorrectly not G8-ed.
=end metadata
=cut
use utf8;
use strict;
use Time::HiRes;
use tasks::EnDashRedirectCreator;
# False positives, mainly due to people doing weird moves that result in the redirect being at a title the bot wouldn't have created.
my %skip = (
'Kick-Ass: The New Girl' => 1,
'Talk:Kick-Ass: The New Girl' => 1,
'Talk:Sunan al-Nasa\'i' => 1,
'Talk:Al-Shushtari' => 1,
);
sub new {
my $class = shift;
my $self = $class->SUPER::new();
$self->{'order'} = 1000;
bless $self, $class;
return $self;
}
=pod
=for info
Per [[WP:BOTUSERSPACE]], any bot or automated editing process that affects only
the operator's or their own userspace, and which are not otherwise disruptive,
may be run without prior approval.
=cut
sub approved {
return 999;
}
sub run {
my ($self, $api) = @_;
$api->task('LeftBehindEnDashRedirectNotifier', 0, 10, qw/d::IWNS d::Talk/);
my %rns = $api->namespace_reverse_map();
my ($dbh);
eval {
($dbh) = $api->connectToReplica( 'enwiki' );
};
if ( $@ ) {
$api->warn( "Error connecting to replica: $@\n" );
return 300;
}
# Spend a max of 5 minutes on this task before restarting
my $endtime=time()+300;
$dbh->do( q{SET NAMES 'utf8'} );
my $actorIds;
eval {
$actorIds = join( ',', @{ $dbh->selectcol_arrayref( "SELECT actor_id FROM actor_user WHERE actor_name = 'AnomieBOT'" ) } );
};
if ( $@ ) {
$api->warn( "Error fetching actor ID from replica: $@\n" );
return 300;
}
my $targetid;
eval {
( $targetid ) = $dbh->selectrow_array( "SELECT lt_id FROM linktarget WHERE lt_namespace=2 AND lt_title = 'AnomieBOT/Auto-G8'" );
};
if ( $@ ) {
$api->warn( "Error fetching linktarget ID from replica: $@\n" );
return 300;
}
my $skipNs = join( ',', @tasks::EnDashRedirectCreator::skipNs );
my $dashstr = join( '', keys %tasks::EnDashRedirectCreator::dashes );
my $dashcond = join( ' OR ', map { "p1.page_title LIKE '%$_%'" } keys %tasks::EnDashRedirectCreator::dashes );
my $dashcond2 = join( ' OR ', map { "p2.page_title LIKE '%$_%'" } keys %tasks::EnDashRedirectCreator::dashes );
my $revreplcond = join( ' OR ', map { "REPLACE(p1.page_title,'-','$_') = p2.page_title" } keys %tasks::EnDashRedirectCreator::dashes );
## First case: Someone removed the Auto-G8 template.
my $t0 = Time::HiRes::time();
my @rows;
eval {
@rows = @{ $dbh->selectall_arrayref( qq{
SET STATEMENT max_statement_time=300 FOR
SELECT p1.page_namespace AS ns, p1.page_title AS title
FROM logging_userindex
JOIN comment_logging ON(log_comment_id=comment_id)
JOIN page AS p1 ON(log_page=page_id)
WHERE
-- AnomieBOT page creations for the en-dash task.
log_actor IN ($actorIds)
AND log_type='create' AND log_action='create'
AND comment_text LIKE '%en-dash%'
-- Ignore pages moved to namespaces we don't handle.
AND page_namespace NOT IN ($skipNs)
-- Only check redirects.
AND page_is_redirect = 1
-- Ignore pages moved to titles the bot definitely wouldn't work with, i.e. no ASCII dash or has an en-dash.
AND page_title LIKE '%-%' AND NOT ($dashcond)
-- Ignore pages with the Auto-G8 template, maybe the auto-g8 is currently active.
AND NOT EXISTS( SELECT 1 FROM templatelinks WHERE tl_from=log_page AND tl_target_id=$targetid )
-- Ignore pages where an en-dashed title still exists.
-- Most pages have only one dash, so we can check for the corresponding title efficiently.
AND NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($revreplcond) )
-- The ones with more dashes need a much less efficient query to catch the case where the target has mixed dashes and en-dashes.
AND (
p1.page_title NOT LIKE '%-%-%'
OR NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($dashcond2) AND REGEXP_REPLACE( CONVERT(p2.page_title USING utf8mb4), '[$dashstr]', '-' ) = p1.page_title )
)
}, { Slice => {} } ) };
};
if ( $@ ) {
$api->warn( "Error fetching first page list from replica: $@\n" );
return 300;
}
my $t1 = Time::HiRes::time();
$api->log( 'DB query took ' . ($t1-$t0) . ' seconds' );
my @pages = ();
for my $row (@rows) {
utf8::decode( $row->{'title'} ); # Data from database is binary
my $page = ( $row->{'ns'} ? $rns{$row->{'ns'}} . ':' : '' ) . $row->{'title'};
$page =~ s/_/ /g;
push @pages, $page unless exists( $skip{$page} );
}
if ( @pages ) {
for my $page (sort @pages) {
$api->whine( "Check en-dash redirect [[:$page]]", "It looks like {{-r|1=$page}} was created by the EnDashRedirectCreator task and should probably no longer exist (i.e. no corresponding en-dashed title seems to exist anymore), but instead of being deleted the [[User:AnomieBOT/Auto-G8]] template has gone missing. Please look into this." );
}
}
## Second case: Someone incorrectly changed the Auto-G8 template.
$t0 = Time::HiRes::time();
eval {
@rows = @{ $dbh->selectall_arrayref( qq{
SET STATEMENT max_statement_time=300 FOR
SELECT p1.page_namespace AS ns, p1.page_title AS title
FROM
externallinks
JOIN page AS p1 ON(el_from = page_id)
WHERE
-- Template is complaining about a mismatch
el_to_domain_index = 'urn:.' AND el_to_path = 'x-anomiebot-auto-g8-mismatch:endash'
-- Ignore pages moved to namespaces we don't handle.
AND page_namespace NOT IN ($skipNs)
-- Only check redirects.
AND page_is_redirect = 1
-- Ignore pages moved to titles the bot definitely wouldn't work with, i.e. no ASCII dash or has an en-dash.
AND page_title LIKE '%-%' AND NOT ($dashcond)
-- Ignore pages where an en-dashed title still exists.
-- Most pages have only one dash, so we can check for the corresponding title efficiently.
AND NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($revreplcond) )
-- The ones with more dashes need a much less efficient query to catch the case where the target has mixed dashes and en-dashes.
AND (
p1.page_title NOT LIKE '%-%-%'
OR NOT EXISTS( SELECT 1 FROM page AS p2 WHERE p2.page_namespace=p1.page_namespace AND ($dashcond2) AND REGEXP_REPLACE( CONVERT(p2.page_title USING utf8mb4), '[$dashstr]', '-' ) = p1.page_title )
)
}, { Slice => {} } ) };
};
if ( $@ ) {
$api->warn( "Error fetching second page list from replica: $@\n" );
return 300;
}
$t1 = Time::HiRes::time();
$api->log( 'DB query took ' . ($t1-$t0) . ' seconds' );
@pages = ();
for my $row (@rows) {
utf8::decode( $row->{'title'} ); # Data from database is binary
my $page = ( $row->{'ns'} ? $rns{$row->{'ns'}} . ':' : '' ) . $row->{'title'};
$page =~ s/_/ /g;
push @pages, $page unless exists( $skip{$page} );
}
if ( @pages ) {
for my $page (sort @pages) {
$api->whine( "Check en-dash redirect [[:$page]]", "It looks like {{-r|1=$page}} was created or adopted by the EnDashRedirectCreator task and should probably no longer exist (i.e. no corresponding en-dashed title seems to exist anymore), but instead of being deleted the [[User:AnomieBOT/Auto-G8]] template's target has been incorrectly changed. Please look into this." );
}
}
my $t = 86400 - ( time() % 86400 );
return $t;
}
1;