Après avoir cherché et utilisé des fonctions html2text, je me suis aperçu qu’aucune ne me satisfaisait vraiment parce que mes besoins n’étaient pas seulement de transcrire en texte.
Pour un projet de moteur de recherche, j’ai besoin de crawler des pages HTML et de sauvegarder seulement les parties intéressantes des pages.
Alors, après pas mal de temps passé à m’amuser avec les regexp, voici quelques lignes de code qui nettoient du code HTML et essaient d’en sortir des phrases :
# Extract readable sentences from an HTML document.
#
# The markup is reduced step by step with regular expressions: scripts, styles
# and comments are dropped, list items / table cells are joined with commas,
# the title, meta description and img alt texts are kept as text, every other
# tag becomes a line break, and finally only the lines long enough to look
# like a sentence are kept.
#
# $html       - HTML source (string)
# $min_length - a line is kept only when its length in bytes, trailing newline
#               included, is greater than this value
#
# Returns the kept lines, each one terminated by "\n" ('' when nothing is kept).
function html_to_sentences($html, $min_length = 50)
{
    $c = (string) $html;

    # NOTE: every tag name below is followed by \b so that a short name does
    # not match a longer one (li/<link>, p/<pre>, tr/<track>, a/<abbr>, ...).

    # delete useless tags together with their content
    $c = preg_replace('@<\s?(script|embed|object|style)\b([^>]+)?>.*<\s?/\s?\1\s?>@isU', '', $c);
    # delete comments
    $c = preg_replace('@<!--.*-->+@isU', '', $c);
    # replace spacing tags
    $c = preg_replace('@<\s?(br|hr)\b[^>]*>+@i', "\n", $c);
    # replace tabs
    $c = str_replace("\t", "\n", $c);
    # replace li and co by their text, comma separated
    $c = preg_replace('@\s*<\s?(td|li|dt|dd)\b([^>]+)?>(.*)<\s?/\s?\1\s?>\s*@isU', '\3, ', $c);
    # closing end of list: delete the last comma and add a point
    $c = preg_replace('@\s*,\s*<\s?/\s?(ul|dl|ol)\s?>\s*@isU', '.', $c);
    # lots of sites use consecutive "a" tags as a listing: whatever stands
    # between two links is replaced by a comma
    $c = preg_replace('@(<\s?/\s?a\s?>)([^<]+)?(<\s?a\b)@isU', '\1, \3', $c);
    # replace links by their text
    $c = preg_replace('@<\s?a\b[^>]*>([^<]*)<\s?/\s?a\s?>@isU', '\1', $c);
    # keep the alt text of images
    # NOTE: "[^\2]" is not a back-reference (inside a character class \2 is an
    # octal escape); the value still stops at the closing quote because the U
    # modifier makes the quantifiers ungreedy.
    $c = preg_replace('@<\s?img([^>]+)?alt\s?=\s?(["|\'])([^\2]+)?\2[^>]*>@isU', "\n" . '\3' . "\n", $c);
    # get meta description (same remark about "[^\3]" and "[^\4]")
    $c = preg_replace('@<\s?meta\s?name\s?=\s?(["|\'])\s?description\s?\1\s?(lang=\s?(["|\'])[^\3]+\3)?\scontent\s?=\s?(["|\'])([^\4]+)\4([^>]+)?>@isU', "\n" . '\5' . "\n", $c);
    # get page title
    $c = preg_replace('@<\s?title\s?>\s?([^<]+)\s?<\s?/\s?title\s?>@isU', "\n" . '\1' . "\n", $c);
    # get text container tags
    $c = preg_replace('@<\s?(p|textarea|div)\b[^>]*>(.*)<\s?/\1\s?>@isU', "\n" . '\2' . "\n", $c);
    # because tables are sometimes used only for layout, add a comma after cells
    $c = preg_replace('@\s*<\s?(td)\b([^>]+)?>(.*)<\s?/\s?\1\s?>\s*@isU', '\3, ', $c);
    # each tr is a new line
    $c = preg_replace('@\s*<\s?(tr)\b([^>]+)?>(.*)<\s?/\s?\1\s?>\s*@isU', '\3' . "\n", $c);
    # delete all other tags
    $c = preg_replace('@<[^>]+>@', "\n", $c);

    # Decode entities only now that the markup is gone: decoding first would
    # turn escaped text such as "&lt;b&gt;" into a tag that the steps above
    # delete, and "&quot;" inside an attribute would end its value too early.
    $c = html_entity_decode($c, ENT_QUOTES, "utf-8");

    # split sentences: a point followed by an uppercase letter starts a new line
    $c = preg_replace('@\.\s*([A-Z])@', ".\n" . '\1', $c);
    # collapse each run of whitespace to its last character
    $c = preg_replace('@(\n|\s)+@', '\1', $c);
    # delete leading whitespace
    $c = preg_replace('@^(\s)+@', '', $c);
    # a line ended by ":" continues on the next one
    $c = preg_replace('@:\s?\n+@', ': ', $c);
    # remove the comma left at the start of a line when replacing links
    $c = preg_replace('@\n,\s+@', "\n", $c);

    # keep only the lines long enough to look like a sentence; working line by
    # line also filters a last line that is not terminated by a newline
    $kept = '';
    foreach (explode("\n", $c) as $line) {
        # +1 counts the newline that terminated the line
        if ($line !== '' && strlen($line) + 1 > $min_length) {
            $kept .= $line . "\n";
        }
    }

    return $kept;
}

$c = file_get_contents("http://www.maboite.org");
# file_get_contents() returns false when the page cannot be fetched
$c = html_to_sentences($c === false ? '' : $c);
// tags: cleanhtml, html2text

