LinkTitles extension for MediaWiki
Automatically add links to existing pages.
 All Classes Files Functions Variables Modules Pages
LinkTitles.body.php
Go to the documentation of this file.
1 <?php
2 /*
3  * Copyright 2012-2014 Daniel Kraus <krada@gmx.net> ('bovender')
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18  * MA 02110-1301, USA.
19  */
21 
/**
 * Helper function for development and debugging.
 *
 * Renders $var with print_r() and appends the result, followed by a
 * newline, to the 'php://stderr' stream via error_log().
 *
 * @param mixed $var Value to dump.
 */
function dump( $var ) {
	$rendered = print_r( $var, true );
	// Message type 3 makes error_log() append to the given destination.
	error_log( $rendered . "\n", 3, 'php://stderr' );
}
28 
/**
 * Central class of the extension.
 *
 * Registers the MediaWiki hooks of the extension and converts occurrences
 * of existing page titles in wiki text into internal links.
 *
 * All state is kept in static properties because the regular expression
 * callbacks need access to the page being parsed and to the link target
 * that is currently being examined.
 */
class LinkTitles {
	/// Title object of the page that is currently being parsed.
	private static $currentTitle;

	/// Title object of the link target that is currently being examined.
	private static $targetTitle;

	/// Content of the current target page. Loaded lazily by
	/// getTargetContent() and reset to null by newTarget().
	private static $targetContent;

	/// Text form of the current target title (Title::getText()).
	private static $targetTitleText;

	/**
	 * Setup function, hooks the extension's functions to MediaWiki events.
	 *
	 * The parse-on-edit and parse-on-render hooks are registered only if
	 * the corresponding configuration variable is truthy; the hook that
	 * announces the magic words is always registered.
	 */
	public static function setup() {
		global $wgHooks;
		// The configuration variables must be imported explicitly; without
		// the 'global' statements they would be undefined locals and no
		// optional hook would ever be registered.
		global $wgLinkTitlesParseOnEdit;
		global $wgLinkTitlesParseOnRender;

		if ( $wgLinkTitlesParseOnEdit ) {
			$wgHooks['PageContentSave'][] = 'LinkTitles::onPageContentSave';
		}
		if ( $wgLinkTitlesParseOnRender ) {
			$wgHooks['InternalParseBeforeLinks'][] = 'LinkTitles::onInternalParseBeforeLinks';
		}
		$wgHooks['GetDoubleUnderscoreIDs'][] = 'LinkTitles::onGetDoubleUnderscoreIDs';
	}

	/**
	 * Event handler that is hooked to the PageContentSave event.
	 *
	 * Unless the edit is flagged as minor, the serialized page content is
	 * run through parseContent() and $content is replaced if links were
	 * added.
	 *
	 * @return bool Always true.
	 */
	public static function onPageContentSave( &$wikiPage, &$user, &$content, &$summary,
		$isMinor, $isWatch, $section, &$flags, &$status ) {

		if ( !$isMinor ) {
			$title = $wikiPage->getTitle();
			$handler = $content->getContentHandler();
			$text = $handler->serializeContent( $content );
			$newText = self::parseContent( $title, $text );
			if ( $newText !== $text ) {
				$content = $handler->unserializeContent( $newText );
			}
		}
		return true;
	}

	/**
	 * Event handler that is hooked to the InternalParseBeforeLinks event.
	 *
	 * @param Parser $parser Parser whose title identifies the current page.
	 * @param string $text Wiki text; replaced by the processed text.
	 * @return bool Always true.
	 */
	public static function onInternalParseBeforeLinks( Parser &$parser, &$text ) {
		$title = $parser->getTitle();
		$text = self::parseContent( $title, $text );
		return true;
	}

	/**
	 * Core function of the extension: adds links to existing pages to $text.
	 *
	 * @param Title $title Title of the page that $text belongs to.
	 * @param string $text Wiki text to process (not modified).
	 * @return string The text with links added, or the unchanged text if it
	 *   contains the magic word identified by MAG_LINKTITLES_NOAUTOLINKS.
	 */
	private static function parseContent( Title &$title, &$text ) {
		// Configuration variables need to be defined here as globals.
		global $wgLinkTitlesFirstOnly;
		global $wgLinkTitlesWordStartOnly;
		global $wgLinkTitlesWordEndOnly;
		global $wgLinkTitlesSmartMode;
		global $wgCapitalLinks;

		// If the page contains the magic word '__NOAUTOLINKS__', do not parse it.
		if ( MagicWord::get( 'MAG_LINKTITLES_NOAUTOLINKS' )->match( $text ) ) {
			return $text;
		}

		// Use unicode character properties rather than \b escape sequences
		// to detect whole words containing non-ASCII characters as well.
		// Note that this requires the use of the '/u' switch, and you need
		// to have PHP with a PCRE library that was compiled with
		// --enable-unicode-properties
		$wordStartDelim = $wgLinkTitlesWordStartOnly ? '(?<!\pL)' : '';
		$wordEndDelim = $wgLinkTitlesWordEndOnly ? '(?!\pL)' : '';

		// A limit of 1 links only the first occurrence, -1 means 'no limit'.
		$limit = $wgLinkTitlesFirstOnly ? 1 : -1;

		self::$currentTitle = $title;
		$delimiter = self::buildDelimiter();

		// A title is not linked when it directly follows one of these
		// characters (e.g. inside URLs or e-mail addresses).
		$patternStart = '/(?<![\:\.\@\/\?\&])' . $wordStartDelim;

		$newText = $text;

		// Iterate through the page titles
		foreach ( self::fetchTargetTitles() as $row ) {
			self::newTarget( $row->page_title );
			self::$targetTitleText = self::$targetTitle->getText();
			if ( self::$targetTitleText === '' ) {
				// An empty search term would match at every position.
				continue;
			}

			// Escape certain special characters in the page title to prevent
			// regexp compilation errors
			$quotedTitle = preg_quote( self::$targetTitleText, '/' );

			// Depending on the global configuration setting $wgCapitalLinks,
			// the title has to be searched for either in a strictly case-sensitive
			// way, or in a 'fuzzy' way where the first letter of the title may
			// be either case. The first letter is isolated as a whole UTF-8
			// character; cutting off a single byte would produce an invalid
			// pattern for titles that start with a multibyte letter.
			if ( $wgCapitalLinks ) {
				list( $firstChar, $remainder ) = self::splitFirstChar( self::$targetTitleText );
				$searchTerm = '((?i)' . preg_quote( $firstChar, '/' ) . '(?-i)' .
					preg_quote( $remainder, '/' ) . ')';
			} else {
				$searchTerm = '(' . $quotedTitle . ')';
			}

			$newText = self::linkOccurrences(
				$newText,
				$delimiter,
				$patternStart . $searchTerm . $wordEndDelim . '/u',
				array( __CLASS__, 'simpleModeCallback' ),
				$limit
			);

			// If smart mode is turned on, the extension will perform a second
			// pass on the page and add links with aliases where the case does
			// not match.
			if ( $wgLinkTitlesSmartMode ) {
				$newText = self::linkOccurrences(
					$newText,
					$delimiter,
					$patternStart . '(' . $quotedTitle . ')' . $wordEndDelim . '/iu',
					array( __CLASS__, 'smartModeCallback' ),
					$limit
				);
			}
		}
		return $newText;
	}

	/**
	 * Builds the regular expression that is used to split the page text
	 * into parts that may be linked and parts that must be left alone.
	 *
	 * The expression captures existing wiki links ("[[...]]"), wiki headings
	 * ("= ... =", "== ... ==" etc.), urls ("http://example.com",
	 * "[http://example.com]", "[http://example.com Description]"), and
	 * email addresses ("mail@example.com"), among others. Since there is a
	 * user option to skip headings, that part of the expression is optional.
	 * Note that in order to use preg_split(), it is important to have only
	 * one capturing subpattern (which precludes the use of conditional
	 * subpatterns).
	 *
	 * @return string Regular expression including delimiters and modifiers.
	 */
	private static function buildDelimiter() {
		global $wgLinkTitlesParseHeadings;
		global $wgLinkTitlesSkipTemplates;

		$headingsDelimiter = $wgLinkTitlesParseHeadings ? '' : '=+.+?=+|';

		if ( $wgLinkTitlesSkipTemplates ) {
			$templatesDelimiter = '{{[^}]+}}|';
		} else {
			// Match template names (ignoring any piped [[]] links in them)
			// along with the trailing pipe and parameter name or closing
			// braces; also match sequences of '|wordcharacters=' (without
			// spaces in them) that usually only occur as parameter names in
			// transclusions (but could also occur as wiki table cell contents).
			// TODO: Find a way to match parameter names in transclusions, but
			// not in table cells or other sequences involving a pipe character
			// and equal sign.
			$templatesDelimiter = '{{[^|]*?(?:(?:\[\[[^]]+]])?)[^|]*?(?:\|(?:\w+=)?|(?:}}))|\|\w+=|';
		}

		$urlPattern = '[a-z]+?\:\/\/(?:\S+\.)+\S+(?:\/.*)?';

		return '/(' .                                     // exclude from linking:
			'\[\[.*?\]\]|' .                                // links
			$headingsDelimiter .                            // headings (if requested)
			$templatesDelimiter .                           // templates (if requested)
			'^ .+?\n|\n .+?\n|\n .+?$|^ .+?$|' .            // preformatted text
			'<nowiki>.*?<.nowiki>|<code>.*?<\/code>|' .     // nowiki/code
			'<pre>.*?<\/pre>|<html>.*?<\/html>|' .          // pre/html
			'<script>.*?<\/script>|' .                      // script
			'<div.+?>|<\/div>|' .                           // attributes of div elements
			'<span.+?>|<\/span>|' .                         // attributes of span elements
			'<file>[^<]*<\/file>|' .                        // file elements
			'style=".+?"|class=".+?"|' .                    // styles and classes (e.g. of wikitables)
			'\[' . $urlPattern . '\s.+?\]|'. $urlPattern . '(?=\s|$)|' . // urls
			'(?<=\b)\S+\@(?:\S+\.)+\S+(?=\b)' .             // email addresses
			')/ism';
	}

	/**
	 * Fetches the titles of all candidate link targets from the database.
	 *
	 * Only titles from 'normal' pages (namespace uid = 0) that have at least
	 * $wgLinkTitlesMinimumTitleLength characters are returned, ordered by
	 * length (shortest first if $wgLinkTitlesPreferShortTitles is set,
	 * longest first otherwise). Blacklisted pages and the current page are
	 * excluded.
	 *
	 * @return mixed Result of the database query; iterating it yields rows
	 *   with a page_title field.
	 */
	private static function fetchTargetTitles() {
		global $wgLinkTitlesBlackList;
		global $wgLinkTitlesMinimumTitleLength;
		global $wgLinkTitlesPreferShortTitles;

		$sortOrder = $wgLinkTitlesPreferShortTitles ? 'ASC' : 'DESC';
		$minLength = intval( $wgLinkTitlesMinimumTitleLength );

		// Build a blacklist of pages that are not supposed to be link
		// targets. This includes the current page. The page table stores
		// titles with underscores instead of spaces, so the configured
		// titles are normalized to that form.
		$excluded = array();
		foreach ( (array) $wgLinkTitlesBlackList as $blackListed ) {
			$excluded[] = str_replace( ' ', '_', $blackListed );
		}
		$excluded[] = self::$currentTitle->getDBkey();

		// Since the db may be sqlite, we need a try..catch structure
		// because sqlite does not support the CHAR_LENGTH function.
		$dbr = wfGetDB( DB_SLAVE );
		try {
			return self::selectTitles( $dbr, 'CHAR_LENGTH', $excluded, $minLength, $sortOrder );
		} catch ( Exception $e ) {
			return self::selectTitles( $dbr, 'LENGTH', $excluded, $minLength, $sortOrder );
		}
	}

	/**
	 * Runs the page title query with the given SQL string length function.
	 *
	 * @param object $dbr Database connection as returned by wfGetDB().
	 * @param string $lengthFunction Name of the SQL length function
	 *   ('CHAR_LENGTH' or 'LENGTH').
	 * @param array $excluded Page titles (database keys) to exclude.
	 * @param int $minLength Minimum length of a title.
	 * @param string $sortOrder 'ASC' or 'DESC'.
	 * @return mixed Result of the select query.
	 */
	private static function selectTitles( $dbr, $lengthFunction, array $excluded,
		$minLength, $sortOrder ) {

		$titleLength = $lengthFunction . '(page_title)';
		return $dbr->select(
			'page',
			'page_title',
			array(
				'page_namespace = 0',
				$titleLength . ' >= ' . $minLength,
				// makeList() quotes and escapes every title for the database.
				'page_title NOT IN (' . $dbr->makeList( $excluded ) . ')',
			),
			__METHOD__,
			array( 'ORDER BY' => $titleLength . ' ' . $sortOrder )
		);
	}

	/**
	 * Splits $text at the protected parts matched by $delimiter and applies
	 * $callback to the matches of $pattern in the remaining parts.
	 *
	 * @param string $text Text to process.
	 * @param string $delimiter Regular expression built by buildDelimiter().
	 * @param string $pattern Regular expression that finds the page title.
	 * @param callable $callback Callback that builds the replacement.
	 * @param int $limit Maximum number of replacements; if it is not
	 *   negative, processing stops after the first part in which a
	 *   replacement was made.
	 * @return string The processed text; the unchanged text if it could
	 *   not be split.
	 */
	private static function linkOccurrences( $text, $delimiter, $pattern, $callback, $limit ) {
		// split the page content by [[...]] groups
		// credits to inhan @ StackOverflow for suggesting preg_split
		// see http://stackoverflow.com/questions/10672286
		$parts = preg_split( $delimiter, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
		if ( $parts === false ) {
			return $text;
		}

		$partCount = count( $parts );
		// even indexes will point to text that is not enclosed by brackets
		for ( $i = 0; $i < $partCount; $i += 2 ) {
			$count = 0;
			$replaced = preg_replace_callback( $pattern, $callback, $parts[$i], $limit, $count );
			if ( $replaced === null ) {
				// A regexp error occurred (e.g. invalid UTF-8); keep the
				// original text rather than wiping it out.
				continue;
			}
			$parts[$i] = $replaced;
			if ( ( $limit >= 0 ) && ( $count > 0 ) ) {
				break;
			}
		}
		return implode( '', $parts );
	}

	/**
	 * Splits a UTF-8 string into its first character and the remainder.
	 *
	 * @param string $text Text to split.
	 * @return array Array with two strings: first character and remainder.
	 */
	private static function splitFirstChar( $text ) {
		$firstChar = mb_substr( $text, 0, 1, 'UTF-8' );
		// substr() may return false rather than '' if nothing is left.
		$remainder = (string) substr( $text, strlen( $firstChar ) );
		return array( $firstChar, $remainder );
	}

	/**
	 * Automatically processes a single page.
	 *
	 * The page is saved with doQuickEditContent() as a minor modification
	 * if links were added. Nothing happens if the page has no content.
	 *
	 * @param string $title Title of the page; a Title object in namespace 0
	 *   is built from it.
	 * @param RequestContext $context Context that provides the user who is
	 *   recorded for the edit.
	 */
	public static function processPage($title, RequestContext $context) {
		// TODO: make this namespace-aware
		$titleObj = Title::makeTitle( 0, $title );
		$page = WikiPage::factory( $titleObj );
		$content = $page->getContent();
		if ( $content === null ) {
			// Nothing to process (e.g. the page does not exist).
			return;
		}
		$handler = $content->getContentHandler();
		$text = $handler->serializeContent( $content );
		$newText = self::parseContent( $titleObj, $text );
		if ( $text !== $newText ) {
			$content = $handler->unserializeContent( $newText );
			$page->doQuickEditContent( $content,
				$context->getUser(),
				"Links to existing pages added by LinkTitles bot.",
				true // minor modification
			);
		}
	}

	/**
	 * Adds the two magic words defined by this extension to the list of
	 * 'double-underscore' terms.
	 *
	 * @param array $doubleUnderscoreIDs List that the identifiers are
	 *   appended to.
	 * @return bool Always true.
	 */
	public static function onGetDoubleUnderscoreIDs( array &$doubleUnderscoreIDs ) {
		$doubleUnderscoreIDs[] = 'MAG_LINKTITLES_NOTARGET';
		$doubleUnderscoreIDs[] = 'MAG_LINKTITLES_NOAUTOLINKS';
		return true;
	}

	/**
	 * Callback function for use with preg_replace_callback in simple mode.
	 *
	 * @param array $matches Matches of the regular expression.
	 * @return string The matched text, enclosed in link brackets if the
	 *   target page may be linked.
	 */
	private static function simpleModeCallback( array $matches ) {
		if ( self::checkTargetPage() ) {
			return '[[' . $matches[0] . ']]';
		}
		return $matches[0];
	}

	/**
	 * Callback function for use with preg_replace_callback in smart mode.
	 *
	 * This essentially performs a case-sensitive comparison of the
	 * current page title and the occurrence found on the page; if
	 * the cases do not match, it builds an aliased (piped) link.
	 * If $wgCapitalLinks is set to true, the case of the first
	 * letter is ignored by MediaWiki and we don't need to build a
	 * piped link if only the case of the first letter is different.
	 *
	 * @param array $matches Matches of the regular expression.
	 * @return string Plain link, piped link, or the unchanged match if the
	 *   target page may not be linked.
	 */
	private static function smartModeCallback( array $matches ) {
		global $wgCapitalLinks;

		$occurrence = $matches[0];
		if ( !self::checkTargetPage() ) {
			return $occurrence;
		}

		if ( $wgCapitalLinks ) {
			// Ignore the first letter of both strings, as it does not matter
			// for linking. The strings are split at a character boundary so
			// that a multibyte first letter is removed as a whole.
			list( , $titleRemainder ) = self::splitFirstChar( self::$targetTitleText );
			list( , $occurrenceRemainder ) = self::splitFirstChar( $occurrence );
			$sameCase = ( $titleRemainder === $occurrenceRemainder );
		} else {
			$sameCase = ( self::$targetTitleText === $occurrence );
		}

		if ( $sameCase ) {
			// Case-sensitive match: no need to build piped link.
			return '[[' . $occurrence . ']]';
		}
		// Case-insensitive match: build piped link.
		return '[[' . self::$targetTitleText . '|' . $occurrence . ']]';
	}

	/**
	 * Sets the link target that is examined next and discards the cached
	 * content of the previous target.
	 *
	 * @param string $title Page title as stored in the page table.
	 */
	private static function newTarget( $title ) {
		// @todo Make this wiki namespace aware.
		self::$targetTitle = Title::makeTitle( NS_MAIN, $title );
		self::$targetContent = null;
	}

	/**
	 * Returns the content of the current target page.
	 *
	 * The content is loaded on first use and cached until newTarget() is
	 * called.
	 *
	 * @return mixed Whatever WikiPage::getContent() returns for the target.
	 */
	private static function getTargetContent() {
		// The static property has to be tested here; a local variable of
		// the same name is never set and would defeat the cache.
		if ( !isset( self::$targetContent ) ) {
			self::$targetContent = WikiPage::factory( self::$targetTitle )->getContent();
		}
		return self::$targetContent;
	}

	/**
	 * Examines the current target page.
	 *
	 * @return bool False if the target page must not be linked because it
	 *   redirects to the current page or contains the no-target magic word
	 *   (and the respective check is enabled); true otherwise.
	 */
	private static function checkTargetPage() {
		global $wgLinkTitlesCheckRedirect;
		global $wgLinkTitlesEnableNoTargetMagicWord;

		if ( !$wgLinkTitlesCheckRedirect && !$wgLinkTitlesEnableNoTargetMagicWord ) {
			// No check is enabled; avoid loading the target page at all.
			return true;
		}

		$targetContent = self::getTargetContent();
		if ( $targetContent === null ) {
			// Without content neither check can be performed; the page is
			// treated as a valid target.
			return true;
		}

		// If checking for redirects is enabled and the target page does
		// indeed redirect to the current page, return the page title as-is
		// (unlinked).
		if ( $wgLinkTitlesCheckRedirect ) {
			$redirectTitle = $targetContent->getUltimateRedirectTarget();
			if ( $redirectTitle && $redirectTitle->equals( self::$currentTitle ) ) {
				return false;
			}
		}

		// If the magic word __NOAUTOLINKTARGET__ is enabled and the target
		// page does indeed contain this magic word, return the page title
		// as-is (unlinked).
		if ( $wgLinkTitlesEnableNoTargetMagicWord ) {
			if ( $targetContent->matchMagicWord(
					MagicWord::get( 'MAG_LINKTITLES_NOTARGET' ) ) ) {
				return false;
			}
		}
		return true;
	}
}
403 
404 // vim: ts=2:sw=2:noet:comments^=\:///
Central class of the extension.
dump($var)
Helper function for development and debugging.
$wgLinkTitlesBlackList
Blacklist of page titles that should never be linked.
Definition: LinkTitles.php:100
$wgLinkTitlesWordEndOnly
Determines whether a page title must end with the end of a word in order for it to be linked...
Definition: LinkTitles.php:138
$wgLinkTitlesFirstOnly
Determines whether to link only the first occurrence of a page title on a page or all occurrences...
Definition: LinkTitles.php:106
$wgLinkTitlesEnableNoTargetMagicWord
Determines whether the magic word __NOAUTOLINKTARGET__ is enabled.
Definition: LinkTitles.php:182
$wgLinkTitlesSkipTemplates
Determines whether to parse text inside templates.
Definition: LinkTitles.php:94
$wgLinkTitlesParseHeadings
Determines whether or not to insert links into headings.
Definition: LinkTitles.php:60
$wgLinkTitlesPreferShortTitles
Controls precedence of page titles.
Definition: LinkTitles.php:51
static onPageContentSave(&$wikiPage, &$user, &$content, &$summary, $isMinor, $isWatch, $section, &$flags, &$status)
Event handler that is hooked to the PageContentSave event.
$wgLinkTitlesWordStartOnly
Determines whether a page title must occur at the start of a word in order for it to be linked...
Definition: LinkTitles.php:120
$wgLinkTitlesMinimumTitleLength
The minimum number of characters in a title that is required for it to be automatically linked to...
Definition: LinkTitles.php:56
static onGetDoubleUnderscoreIDs(array &$doubleUnderscoreIDs)
Adds the two magic words defined by this extension to the list of 'double-underscore' terms that are ...
$wgLinkTitlesParseOnRender
Important configuration variable that determines when the extension will process a page...
Definition: LinkTitles.php:84
$wgLinkTitlesSmartMode
Important setting that controls the extension's smart mode of operation.
Definition: LinkTitles.php:153
static onInternalParseBeforeLinks(Parser &$parser, &$text)
Event handler that is hooked to the InternalParseBeforeLinks event.
$wgLinkTitlesParseOnEdit
Important configuration variable that determines when the extension will process a page...
Definition: LinkTitles.php:71
static setup()
Setup function, hooks the extension's functions to MediaWiki events.
static processPage($title, RequestContext $context)
Automatically processes a single page in the main namespace, given its title as a string in $title.
$wgLinkTitlesCheckRedirect
Determines whether or not to check if a page redirects to the current page.
Definition: LinkTitles.php:167