LinkTitles extension for MediaWiki
Automatically add links to existing pages.
Splitter.php
1 <?php
24 namespace LinkTitles;
25 
/**
 * Caches a regular expression that delimits text to be parsed.
 *
 * The expression is built once per instance from the configuration and is
 * then used by split() to cut wiki text into alternating sections that may
 * be linked and sections that must be left untouched.
 */
class Splitter {
	/**
	 * The splitting expression that separates text to be parsed from text
	 * that must not be parsed.
	 *
	 * @var string
	 */
	public $splitter;

	/**
	 * The LinkTitles configuration for this Splitter instance.
	 *
	 * @var Config
	 */
	public $config;

	/**
	 * The cached singleton instance, or null if none has been built yet.
	 *
	 * @var Splitter|null
	 */
	private static $instance;

	/**
	 * Gets the Splitter singleton; builds one with the given config, or with
	 * a default config if none is given.
	 *
	 * If an instance already exists, it is returned as is and $config is
	 * ignored; call invalidate() first to force a rebuild.
	 *
	 * @param Config|null &$config Configuration to build the instance with.
	 *   Because it is taken by reference, a null variable passed in is
	 *   replaced with the newly created default Config when an instance has
	 *   to be built.
	 * @return Splitter The singleton instance.
	 */
	public static function singleton( Config &$config = null ) {
		if ( self::$instance === null ) {
			if ( $config === null ) {
				$config = new Config();
			}
			self::$instance = new Splitter( $config );
		}
		return self::$instance;
	}

	/**
	 * Invalidates the singleton instance, so that the next call to
	 * singleton() builds a fresh one.
	 */
	public static function invalidate() {
		self::$instance = null;
	}

	/**
	 * Stores the configuration and builds the splitting expression from it.
	 *
	 * @param Config $config Configuration for this instance.
	 */
	protected function __construct( Config $config ) {
		$this->config = $config;
		$this->buildSplitter();
	}

	/**
	 * Splits a text into sections that may be linked and sections that may
	 * not be linked.
	 *
	 * Because the splitting expression has exactly one capturing group and
	 * PREG_SPLIT_DELIM_CAPTURE is used, the result alternates: even indexes
	 * hold text between delimiters, odd indexes hold the matched delimiters
	 * (the parts that must not be parsed).
	 *
	 * @param string &$text Text to split.
	 * @return string[] Alternating text/delimiter sections. If the regular
	 *   expression engine fails (preg_split() returns false, e.g. when a
	 *   PCRE limit is hit), the whole text is returned as a single delimiter
	 *   section so that it is passed through unmodified.
	 */
	public function split( &$text ) {
		$sections = preg_split( $this->splitter, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
		if ( $sections === false ) {
			// Keep the array shape callers rely on; putting the text in the
			// odd (delimiter) slot marks all of it as "do not parse".
			return [ '', $text ];
		}
		return $sections;
	}

	/*
	 * Builds the delimiter that is used in a regexp to separate
	 * text that should be parsed from text that should not be
	 * parsed (e.g. inside existing links etc.)
	 */
	private function buildSplitter() {
		if ( $this->config->skipTemplates )
		{
			// Use recursive regex to balance curly braces;
			// see http://www.regular-expressions.info/recurse.html
			// NOTE: (?R) recurses into the *entire* splitter expression, not
			// just this alternative. A numbered or named group cannot be used
			// instead, because preg_split() needs a single capturing group.
			$templatesDelimiter = '{{(?>[^{}]|(?R))*}}|';
		} else {
			// Match template names (ignoring any piped [[]] links in them)
			// along with the trailing pipe and parameter name or closing
			// braces; also match sequences of '|wordcharacters=' (without
			// spaces in them) that usually only occur as parameter names in
			// transclusions (but could also occur as wiki table cell contents).
			// TODO: Find a way to match parameter names in transclusions, but
			// not in table cells or other sequences involving a pipe character
			// and equal sign.
			$templatesDelimiter = '{{[^|]*?(?:(?:\[\[[^]]+]])?)[^|]*?(?:\|(?:\w+=)?|(?:}}))|\|\w+=|';
		}

		// Build a regular expression that will capture existing wiki links ("[[...]]"),
		// wiki headings ("= ... =", "== ... ==" etc.),
		// urls ("http://example.com", "[http://example.com]", "[http://example.com Description]",
		// and email addresses ("mail@example.com").

		// Match WikiText headings.
		// Since there is a user option to skip headings, we make this part of the
		// expression optional. Note that in order to use preg_split(), it is
		// important to have only one capturing subpattern (which precludes the use
		// of conditional subpatterns).
		// Caveat: This regex pattern should be improved to deal with balanced '='s
		// only. However, this would require grouping in the pattern which does not
		// agree with preg_split.
		$headingsDelimiter = $this->config->parseHeadings ? '' : '^=+[^=]+=+$|';

		$urlPattern = '[a-z]+?\:\/\/(?:\S+\.)+\S+(?:\/.*)?';
		$this->splitter = '/(' .                     // exclude from linking:
			'\[\[.*?\]\]|' .                            // links
			$headingsDelimiter .                        // headings (if requested)
			$templatesDelimiter .                       // templates (if requested)
			'^ .+?\n|\n .+?\n|\n .+?$|^ .+?$|' .        // preformatted text
			// The closing tag's slash is escaped like in all other tag
			// alternatives; a bare '.' would accept any character there.
			'<nowiki>.*?<\/nowiki>|<code>.*?<\/code>|' . // nowiki/code
			'<pre>.*?<\/pre>|<html>.*?<\/html>|' .      // pre/html
			'<script>.*?<\/script>|' .                  // script
			'<syntaxhighlight.*?>.*?<\/syntaxhighlight>|' . // syntaxhighlight
			'<gallery>.*?<\/gallery>|' .                // gallery
			'<div.*?>|<\/div>|' .                       // attributes of div elements
			'<span.*?>|<\/span>|' .                     // attributes of span elements
			'<file>[^<]*<\/file>|' .                    // stuff inside file elements
			'style=".+?"|class=".+?"|' .                // styles and classes (e.g. of wikitables)
			'<noautolinks>.*?<\/noautolinks>|' .        // custom tag 'noautolinks'
			'\[' . $urlPattern . '\s.+?\]|'. $urlPattern . '(?=\s|$)|' . // urls
			'(?<=\b)\S+\@(?:\S+\.)+\S+(?=\b)' .         // email addresses
			')/ismS';
	}
}
$splitter
The splitting expression that separates text to be parsed from text that must not be parsed...
Definition: Splitter.php:35
$config
The LinkTitles configuration for this Splitter instance.
Definition: Splitter.php:41
static singleton(Config &$config=null)
Gets the Splitter singleton; may build one with the given config or the default config if none is given.
Definition: Splitter.php:55
Holds LinkTitles configuration.
Definition: Config.php:37
Caches a regular expression that delimits text to be parsed.
Definition: Splitter.php:29
The LinkTitles class holds configuration for the LinkTitles extension.
Definition: Config.php:23
split(&$text)
Splits a text into sections that may be linked and sections that may not be linked (e...
Definition: Splitter.php:86
static invalidate()
Invalidates the singleton instance.
Definition: Splitter.php:70