source-function-F_html_to_text

It appears that you are using ad-blocking software. The cost of running this website is covered by advertisements. If you like it, please feel free to donate a small amount of money to secure the future of this website.
Overview

Classes

Interfaces

Exceptions

Functions

  1: <?php
  2: //============================================================+
  3: // File name   : tce_functions_html2txt.php
  4: // Begin       : 2001-10-21
  5: // Last Update : 2017-04-22
  6: //
  7: // Description : Function to convert HTML code to Text string.
  8: //
  9: // Author: Nicola Asuni
 10: //
 11: // (c) Copyright:
 12: //               Nicola Asuni
 13: //               Tecnick.com LTD
 14: //               www.tecnick.com
 15: //               info@tecnick.com
 16: //
 17: // License:
 18: //    Copyright (C) 2004-2017  Nicola Asuni - Tecnick.com LTD
 19: //    See LICENSE.TXT file for more information.
 20: //============================================================+
 21: 
 22: /**
 23:  * @file
 24:  * Function to convert HTML code to Text string.
 25:  * @package com.tecnick.tcexam.shared
 26:  * @author Nicola Asuni
 27:  * @since 2003-03-31
 28:  */
 29: 
 30: /**
 31:  * Convert HTML code to Text string.
 32:  * @param $str (string) HTML code string to convert.
 33:  * @param $preserve_newlines (boolean) If true convert newline characters to HTML line breaks.
 34:  * @param $display_links (boolean) If true gives a textual representation of links and images.
 35:  * @return text string
 36:  */
 37: function F_html_to_text($str, $preserve_newlines = false, $display_links = false)
 38: {
 39:     require_once('../../shared/code/tce_functions_general.php');
 40: 
 41:     $dollar_replacement = ":.dlr.:"; //string replacement for dollar symbol
 42: 
 43:     //tags conversion table
 44:     $tags2textTable = array (
 45:         "'<br[^>]*?>'i" => "\n",
 46:         "'<p[^>]*?>'i" => "\n",
 47:         "'</p>'i" => "\n",
 48:         "'<div[^>]*?>'i" => "\n",
 49:         "'</div>'i" => "\n",
 50:         "'<table[^>]*?>'i" => "\n",
 51:         "'</table>'i" => "\n",
 52:         "'<tr[^>]*?>'i" => "\n",
 53:         "'<th[^>]*?>'i" => "\t ",
 54:         "'<td[^>]*?>'i" => "\t ",
 55:         "'<li[^>]*?>\t'i" => "\n",
 56:         "'<h[0-9][^>]*?>'i" => "\n\n",
 57:         "'</h[0-9]>'i" => "\n",
 58:         "'<head[^>]*?>.*?</head>'si" => "\n",  // Strip out head
 59:         "'<style[^>]*?>.*?</style>'si" => "\n",  // Strip out style
 60:         "'<script[^>]*?>.*?</script>'si" => "\n"  // Strip out javascript
 61:     );
 62: 
 63:     $str = str_replace("\r\n", "\n", $str);
 64: 
 65:     $str = str_replace("\$", $dollar_replacement, $str); //replace special character
 66: 
 67:     //remove session variable PHPSESSID from links
 68:     $str = preg_replace("/(\?|\&|%3F|%26|\&amp;|%26amp%3B)PHPSESSID(=|%3D)[a-z0-9]{32,32}/i", "", $str);
 69: 
 70:     //remove applet and get alternative content
 71:     $str = preg_replace_callback(
 72:         "/<applet[^>]*?>(.*?)<\/applet>/si",
 73:         function($subs) {
 74:             return preg_replace("/<param[^>]*>/i", "", $subs[1]);
 75:         },
 76:         $str
 77:     );
 78: 
 79:     //remove object and get alternative content
 80:     $str = preg_replace_callback(
 81:         "/<object[^>]*?>(.*?)<\/object>/si",
 82:         function($subs) {
 83:             return preg_replace("/<param[^>]*>/i", "", $subs[1]);
 84:         },
 85:         $str);
 86: 
 87:     //indent list elements
 88:     $firstposition = 0;
 89:     while (($pos=strpos($str, "<ul")) > $firstposition) {
 90:         $str = preg_replace_callback(
 91:             "/<ul[^>]*?>(.*?)<\/ul>/si",
 92:             function($subs) {
 93:                 return preg_replace("/<li[^>]*>/i", "<li>\t", $subs[1]);
 94:             },
 95:             $str);
 96:         $firstposition = $pos;
 97:     }
 98:     $firstposition = 0;
 99:     while (($pos=strpos($str, "<ol")) > $firstposition) {
100:         $str = preg_replace_callback(
101:             "/<ol[^>]*?>(.*?)<\/ol>/si",
102:             function($subs) {
103:                 return preg_replace("/<li[^>]*>/i", "<li>\t", $subs[1]);
104:             },
105:             $str);
106:         $firstposition = $pos;
107:     }
108: 
109:     $str = preg_replace("'<img[^>]*alt[\s]*=[\s]*[\"\']*([^\"\'<>]*)[\"\'][^>]*>'i", "[IMAGE: \\1]", $str);
110: 
111:     // give a textual representation of links and images
112:     if ($display_links) {
113:         $str = preg_replace("'<a[^>]*href[\s]*=[\s]*[\"\']*([^\"\'<>]*)[\"\'][^>]*>(.*?)</a>'si", "\\2 [LINK: \\1]", $str);
114:     }
115: 
116:     if (!$preserve_newlines) { //remove newlines
117:         $str = str_replace("\n", "", $str);
118:     }
119: 
120:     $str = preg_replace(array_keys($tags2textTable), array_values($tags2textTable), $str);
121: 
122:     $str = preg_replace("'<[^>]*?>'si", "", $str); //strip out remaining tags
123: 
124:     //remove some newlines in excess
125:     $str = preg_replace("'[ \t\f]+[\r\n]'si", "\n", $str);
126:     $str = preg_replace("'[\r\n][\r\n]+'si", "\n\n", $str);
127: 
128:     $str = unhtmlentities($str, false);
129: 
130:     $str = str_replace($dollar_replacement, "\$", $str); //restore special character
131: 
132:     return stripslashes(trim($str));
133: }
134: 
135: //============================================================+
136: // END OF FILE
137: //============================================================+
138: 
 

© 2004-2018 – Nicola Asuni - Tecnick.com - All rights reserved.
about - disclaimer - privacy