root/trunk/wp-includes/formatting.php

Revision 1522, 66.5 kB (checked in by donncha, 3 days ago)

WP Merge with revision 9730

Line 
1 <?php
2 /**
3  * Main WordPress Formatting API.
4  *
5  * Handles many functions for formatting output.
6  *
7  * @package WordPress
8  **/
9
/**
 * Replaces common plain text characters with typographic HTML entities.
 *
 * As an example,
 * <code>
 * 'cause today's effort makes it worth tomorrow's "holiday"...
 * </code>
 * Becomes:
 * <code>
 * &#8217;cause today&#8217;s effort makes it worth tomorrow&#8217;s &#8220;holiday&#8221;&#8230;
 * </code>
 *
 * The text is split on HTML tags and [bracketed] tokens; those delimiters are
 * never texturized themselves. The text chunk directly following an opening
 * code, kbd, style or script tag is skipped, and everything between an
 * opening pre tag and its closing tag is skipped as well. Stray ampersands
 * are encoded as &#038; in every chunk, including the skipped ones.
 *
 * @since 0.71
 * @uses $wp_cockneyreplace Optional array (search => replacement) of phrases supplied by a plugin
 *
 * @param string $text The text to be formatted
 * @return string The text with plain characters replaced by HTML entities
 */
function wptexturize($text) {
    global $wp_cockneyreplace;

    $next = true;            // false while the next text chunk must be skipped (follows <code>, <kbd>, <style>, <script>)
    $has_pre_parent = false; // true while inside a <pre> element
    $output = '';

    // Ungreedy split that keeps the delimiters (tags and [bracketed] tokens) in the result
    $textarr = preg_split('/(<.*>|\[.*\])/Us', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

    // if a plugin has provided an autocorrect array, use it
    if ( isset($wp_cockneyreplace) ) {
        $cockney = array_keys($wp_cockneyreplace);
        $cockneyreplace = array_values($wp_cockneyreplace);
    } else {
        $cockney = array("'tain't","'twere","'twas","'tis","'twill","'til","'bout","'nuff","'round","'cause");
        $cockneyreplace = array("&#8217;tain&#8217;t","&#8217;twere","&#8217;twas","&#8217;tis","&#8217;twill","&#8217;til","&#8217;bout","&#8217;nuff","&#8217;round","&#8217;cause");
    }

    // Literal replacements; applied in array order, so 'xn&#8211;' undoes the '--' replacement inside 'xn--'
    $static_characters = array_merge(array('---', ' -- ', '--', 'xn&#8211;', '...', '``', '\'s', '\'\'', ' (tm)'), $cockney);
    $static_replacements = array_merge(array('&#8212;', ' &#8212; ', '&#8211;', 'xn--', '&#8230;', '&#8220;', '&#8217;s', '&#8221;', ' &#8482;'), $cockneyreplace);

    // Context-sensitive replacements (quotes, primes, multiplication sign); also applied in array order
    $dynamic_characters = array('/\'(\d\d(?:&#8217;|\')?s)/', '/(\s|\A|")\'/', '/(\d+)"/', '/(\d+)\'/', '/(\S)\'([^\'\s])/', '/(\s|\A)"(?!\s)/', '/"(\s|\S|\Z)/', '/\'([\s.]|\Z)/', '/(\d+)x(\d+)/');
    $dynamic_replacements = array('&#8217;$1','$1&#8216;', '$1&#8243;', '$1&#8242;', '$1&#8217;$2', '$1&#8220;$2', '&#8221;$1', '&#8217;$1', '$1&#215;$2');

    foreach ( $textarr as $curl ) {
        // Square-bracket offsets: the curly-brace form ($curl{0}) is a parse error on PHP 8
        if ( !empty($curl) && '<' != $curl[0] && '[' != $curl[0] && $next && !$has_pre_parent ) { // If it's not a tag
            // static strings
            $curl = str_replace($static_characters, $static_replacements, $curl);
            // regular expressions
            $curl = preg_replace($dynamic_characters, $dynamic_replacements, $curl);
        } elseif ( strpos($curl, '<code') !== false || strpos($curl, '<kbd') !== false || strpos($curl, '<style') !== false || strpos($curl, '<script') !== false ) {
            $next = false;
        } elseif ( strpos($curl, '<pre') !== false ) {
            $has_pre_parent = true;
        } elseif ( strpos($curl, '</pre>') !== false ) {
            $has_pre_parent = false;
        } else {
            // Any other chunk (including a skipped text chunk) re-enables texturizing
            $next = true;
        }

        // Encode ampersands that do not start a numeric or named entity
        $curl = preg_replace('/&([^#])(?![a-zA-Z1-4]{1,8};)/', '&#038;$1', $curl);
        $output .= $curl;
    }

    return $output;
}
77
/**
 * Strips paragraph and line-break markup from a <pre> block.
 *
 * Accepts either the matches array handed over by preg_replace_callback()
 * in wpautop() or a plain string. For an array, the block is rebuilt from
 * the captured opening tag ($matches[1]) and contents ($matches[2]) plus a
 * closing </pre>.
 *
 * @since 1.2.0
 *
 * @param array|string $matches The array or string
 * @return string The pre block without paragraph/line-break conversion.
 */
function clean_pre($matches) {
    $block = is_array($matches) ? $matches[1] . $matches[2] . "</pre>" : $matches;

    // Applied in order: drop <br />, turn <p> into a newline, drop </p>
    return str_replace(
        array('<br />', '<p>', '</p>'),
        array('', "\n", ''),
        $block
    );
}
101
/**
 * Replaces double line-breaks with paragraph elements.
 *
 * A group of regex replaces used to identify text formatted with newlines and
 * replace double line-breaks with HTML paragraph tags. The remaining
 * line-breaks after conversion become <br /> tags, unless $br is set to a
 * false value (0 or false).
 *
 * Newlines inside script and style elements are protected from the line-break
 * conversion, <pre> blocks are cleaned up again through clean_pre(), and
 * shortcodes standing alone in a paragraph are unwrapped using
 * get_shortcode_regex().
 *
 * @since 0.71
 *
 * @param string $pee The text which has to be formatted.
 * @param int|bool $br Optional. If set, this will convert all remaining line-breaks after paragraphing. Default true.
 * @return string Text which has been converted into correct paragraph tags.
 */
function wpautop($pee, $br = 1) {
    $pee = $pee . "\n"; // just to make things a little easier, pad the end
    $pee = preg_replace('|<br />\s*<br />|', "\n\n", $pee);
    // Space things out a little
    $allblocks = '(?:table|thead|tfoot|caption|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr)';
    $pee = preg_replace('!(<' . $allblocks . '[^>]*>)!', "\n$1", $pee);
    $pee = preg_replace('!(</' . $allblocks . '>)!', "$1\n\n", $pee);
    $pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
    if ( strpos($pee, '<object') !== false ) {
        $pee = preg_replace('|\s*<param([^>]*)>\s*|', "<param$1>", $pee); // no pee inside object/embed
        $pee = preg_replace('|\s*</embed>\s*|', '</embed>', $pee);
    }
    $pee = preg_replace("/\n\n+/", "\n\n", $pee); // take care of duplicates
    // make paragraphs, including one at the end
    $pees = preg_split('/\n\s*\n/', $pee, -1, PREG_SPLIT_NO_EMPTY);
    $pee = '';
    foreach ( $pees as $tinkle )
        $pee .= '<p>' . trim($tinkle, "\n") . "</p>\n";
    $pee = preg_replace('|<p>\s*?</p>|', '', $pee); // under certain strange conditions it could create a P of entirely whitespace
    $pee = preg_replace('!<p>([^<]+)\s*?(</(?:div|address|form)[^>]*>)!', "<p>$1</p>$2", $pee);
    // NOTE: the pattern has no capture group, so $1 is empty and this replaces <p> with <p> (kept for parity)
    $pee = preg_replace( '|<p>|', "$1<p>", $pee );
    $pee = preg_replace('!<p>\s*(</?' . $allblocks . '[^>]*>)\s*</p>!', "$1", $pee); // don't pee all over a tag
    $pee = preg_replace("|<p>(<li.+?)</p>|", "$1", $pee); // problem with nested lists
    $pee = preg_replace('|<p><blockquote([^>]*)>|i', "<blockquote$1><p>", $pee);
    $pee = str_replace('</blockquote></p>', '</p></blockquote>', $pee);
    $pee = preg_replace('!<p>\s*(</?' . $allblocks . '[^>]*>)!', "$1", $pee);
    $pee = preg_replace('!(</?' . $allblocks . '[^>]*>)\s*</p>!', "$1", $pee);
    if ($br) {
        // Named callback instead of create_function(), which no longer exists on PHP 8
        $pee = preg_replace_callback('/<(script|style).*?<\/\\1>/s', '_autop_newline_preservation_helper', $pee);
        $pee = preg_replace('|(?<!<br />)\s*\n|', "<br />\n", $pee); // optionally make line breaks
        $pee = str_replace('<WPPreserveNewline />', "\n", $pee);
    }
    $pee = preg_replace('!(</?' . $allblocks . '[^>]*>)\s*<br />!', "$1", $pee);
    $pee = preg_replace('!<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)!', '$1', $pee);
    if (strpos($pee, '<pre') !== false)
        $pee = preg_replace_callback('!(<pre.*?>)(.*?)</pre>!is', 'clean_pre', $pee );
    $pee = preg_replace( "|\n</p>$|", '</p>', $pee );
    $pee = preg_replace('/<p>\s*?(' . get_shortcode_regex() . ')\s*<\/p>/s', '$1', $pee); // don't auto-p wrap shortcodes that stand alone

    return $pee;
}

/**
 * Newline preservation callback for wpautop().
 *
 * Swaps every newline in the matched script/style element for a placeholder
 * tag so the line-break conversion in wpautop() leaves it alone; wpautop()
 * restores the newlines afterwards.
 *
 * @access private
 *
 * @param array $matches preg_replace_callback() matches array
 * @return string The matched text with newlines replaced by the placeholder
 */
function _autop_newline_preservation_helper($matches) {
    return str_replace("\n", "<WPPreserveNewline />", $matches[0]);
}
157
/**
 * Checks to see if a string is utf8 encoded.
 *
 * Walks the string byte by byte: every lead byte must match one of the
 * 1- to 6-byte lead patterns and be followed by the matching number of
 * 10bbbbbb continuation bytes. Only the bit patterns are checked.
 *
 * @author bmorel at ssi dot fr
 *
 * @since 1.2.1
 *
 * @param string $Str The string to be checked
 * @return bool True if $Str fits a UTF-8 model, false otherwise.
 */
function seems_utf8($Str) { # by bmorel at ssi dot fr
    $total = strlen($Str);
    $pos = 0;

    while ($pos < $total) {
        $lead = ord($Str[$pos]);

        if ($lead < 0x80) { # 0bbbbbbb
            $pos++;
            continue;
        }

        if (($lead & 0xE0) == 0xC0) {        # 110bbbbb
            $trailing = 1;
        } elseif (($lead & 0xF0) == 0xE0) {  # 1110bbbb
            $trailing = 2;
        } elseif (($lead & 0xF8) == 0xF0) {  # 11110bbb
            $trailing = 3;
        } elseif (($lead & 0xFC) == 0xF8) {  # 111110bb
            $trailing = 4;
        } elseif (($lead & 0xFE) == 0xFC) {  # 1111110b
            $trailing = 5;
        } else {
            return false; # Does not match any model
        }

        # The announced number of 10bbbbbb bytes must follow
        while ($trailing-- > 0) {
            $pos++;
            if ($pos == $total || (ord($Str[$pos]) & 0xC0) != 0x80) {
                return false;
            }
        }

        $pos++;
    }

    return true;
}
185
/**
 * Converts a number of special characters into their HTML entities.
 *
 * Works like htmlspecialchars() except that ampersands which already start
 * an HTML entity are left alone. Specifically changes: & to &#038;,
 * < to &lt; and > to &gt;.
 *
 * $quotes can be set to 'single' to encode ' to &#039;, 'double' to encode " to
 * &quot;, or any other true value (such as 1) to do both. Default is 0 where
 * no quotes are encoded.
 *
 * @since 1.2.2
 *
 * @param string $text The text which is to be encoded.
 * @param mixed $quotes Optional. Converts single quotes if set to 'single', double if set to 'double' or both if otherwise set. Default 0.
 * @return string The encoded text with HTML entities.
 */
function wp_specialchars( $text, $quotes = 0 ) {
    // Two passes: the first leaves a fresh "&&" behind for runs of three or more ampersands
    for ( $pass = 0; $pass < 2; $pass++ ) {
        $text = str_replace('&&', '&#038;&', $text);
    }

    // Encode an ampersand at the end of the text, or one that does not begin an entity
    $text = preg_replace('/&(?:$|([^#])(?![a-z1-4]{1,8};))/', '&#038;$1', $text);
    $text = str_replace(array('<', '>'), array('&lt;', '&gt;'), $text);

    $only_double = ( 'double' === $quotes );
    $only_single = ( 'single' === $quotes );
    $both = ( !$only_double && !$only_single && $quotes );

    if ( $only_double || $both ) {
        $text = str_replace('"', '&quot;', $text);
    }
    if ( $only_single || $both ) {
        $text = str_replace("'", '&#039;', $text);
    }

    return $text;
}
218
/**
 * Encode the Unicode values to be used in the URI.
 *
 * ASCII bytes are copied through unchanged. Every byte of a multi-byte UTF-8
 * sequence is emitted as a percent-escape with lowercase hex digits
 * (e.g. %c3%a9). Sequences of two, three and four octets are supported.
 *
 * When $length is non-zero it limits the length of the encoded output:
 * encoding stops before a character whose encoded form would not fit, so a
 * multi-byte character is never emitted partially because of the limit.
 *
 * @since 1.5.0
 *
 * @param string $utf8_string
 * @param int $length Max length of the string
 * @return string String with Unicode encoded for URI.
 */
function utf8_uri_encode( $utf8_string, $length = 0 ) {
    $unicode = '';
    $values = array();   // octets collected so far for the current multi-byte character
    $num_octets = 1;
    $unicode_length = 0; // length of the encoded output so far

    $string_length = strlen( $utf8_string );
    for ( $i = 0; $i < $string_length; $i++ ) {

        $value = ord( $utf8_string[ $i ] );

        if ( $value < 128 ) {
            if ( $length && ( $unicode_length >= $length ) )
                break;
            $unicode .= chr($value);
            $unicode_length++;
        } else {
            // On a lead byte, work out how many octets the character occupies.
            // Lead bytes >= 240 start a four-octet sequence; treating them as
            // three octets mis-groups the last byte and can drop it from the output.
            if ( count( $values ) == 0 ) {
                if ( $value < 224 ) {
                    $num_octets = 2;
                } elseif ( $value < 240 ) {
                    $num_octets = 3;
                } else {
                    $num_octets = 4;
                }
            }

            $values[] = $value;

            // Each octet becomes three output characters (%xx)
            if ( $length && ( $unicode_length + ($num_octets * 3) ) > $length )
                break;
            if ( count( $values ) == $num_octets ) {
                for ( $j = 0; $j < $num_octets; $j++ ) {
                    $unicode .= '%' . dechex( $values[ $j ] );
                }
                $unicode_length += $num_octets * 3;

                $values = array();
                $num_octets = 1;
            }
        }
    }

    return $unicode;
}
268
269 /**
270  * Converts all accent characters to ASCII characters.
271  *
272  * If there are no accent characters, then the string given is just returned.
273  *
274  * @since 1.2.1
275  *
276  * @param string $string Text that might have accent characters
277  * @return string Filtered string with replaced "nice" characters.
278  */
279 function remove_accents($string) {
280     if ( !preg_match('/[\x80-\xff]/', $string) )
281         return $string;
282
283     if (seems_utf8($string)) {
284         $chars = array(
285         // Decompositions for Latin-1 Supplement
286         chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
287         chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
288         chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
289         chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
290         chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
291         chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
292         chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
293         chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
294         chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
295         chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
296         chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
297         chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
298         chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
299         chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
300         chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
301         chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
302         chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
303         chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
304         chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
305         chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
306         chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
307         chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
308         chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
309         chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
310         chr(195).chr(182) => 'o', chr(195).chr(185) => 'u',
311         chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
312         chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
313         chr(195).chr(191) => 'y',
314         // Decompositions for Latin Extended-A
315         chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
316         chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
317         chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
318         chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
319         chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
320         chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
321         chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
322         chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
323         chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
324         chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
325         chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
326         chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
327         chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
328         chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
329         chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
330         chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
331         chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
332         chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
333         chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
334         chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
335         chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
336         chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
337         chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
338         chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
339         chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
340         chr(196).chr(178) => 'IJ',chr(196).chr(179) => 'ij',
341         chr(196).chr(180) => 'J',