.01
.02
.03
.04
.05
.06
.07
.08
.09
.10
.11
.12
.13
.14
.15
.16
.17
.18
.19
.20
.21
.22
.23
.24
.25
.26
.27
.28
.29
.30
.31
.32
.33
.34
.35
.36
.37
.38
.39
.40
.41
.42
.43
.44
.45
.46
.47
.48
.49
.50
.51
.52
.53
.54
.55
.56
.57
.58
.59
.60
.61
.62
.63
.64
.65
.66
.67
.68
.69
.70
.71
.72
.73
.74
.75
.76
.77
.78
.79
.80
.81
.82
.83
.84
.85
.86
.87
.88
.89
.90
.91
.92
.93
.94
.95
.96
.97
.98
.99
.100
.101
.102
.103
.104
.105
.106
.107
.108
.109
.110
.111
.112
.113
.114
.115
.116
.117
.118
.119
.120
.121
.122
.123
.124
.125
.126
.127
.128
.129
.130
.131
.132
.133
.134
.135
.136
.137
.138
.139
.140
.141
.142
.143
.144
.145
.146
.147
.148
.149
.150
.151
.152
.153
.154
.155
.156
.157
.158
.159
.160
.161
.162
.163
.164
.165
.166
.167
.168
.169
.170
.171
.172
.173
.174
.175
.176
.177
.178
.179
.180
.181
.182
.183
.184
.185
.186
.187
.188
.189
.190
.191
.192
.193
.194
.195
.196
.197
.198
.199
.200
.201
.202
.203
.204
.205
.206
.207
.208
.209
.210
.211
.212
.213
.214
.215
.216
.217
.218
.219
.220
.221
.222
.223
.224
.225
.226
.227
.228
.229
.230
.231
.232
.233
.234
.235
.236
.237
.238
.239
.240
.241
.242
.243
.244
.245
.246
.247
.248
.249
.250
.251
.252
.253
.254
.255
.256
.257
.258
.259
.260
.261
.262
.263
.264
.265
.266
.267
.268
.269
.270
.271
.272
.273
.274
.275
.276
.277
.278
.279
.280
.281
.282
.283
.284
.285
.286
.287
.288
.289
|
|
<?php
/******************************************************************************/
/* */
/* __ ____ */
/* ___ / / ___ / __/__ __ _____________ ___ */
/* / _ \/ _ \/ _ \_\ \/ _ \/ // / __/ __/ -_|_-< */
/* / .__/_//_/ .__/___/\___/\_,_/_/ \__/\__/___/ */
/* /_/ /_/ */
/* */
/* */
/******************************************************************************/
/* */
/* Titre : Connaitre les doublons dans une table SQL */
/* */
/* URL : http://www.phpsources.org/scripts333-PHP.htm */
/* Auteur : KOogar */
/* Date édition : 31 Jan 2008 */
/* Website auteur : http://www.phpsources.org */
/* */
/******************************************************************************/
/*******************************************************************************
* SQL connection
***************************************************************************/
// NOTE(review): host, user and password are hard-coded here; adjust them to the target server.
// Name of the database that is selected once the server link is open.
$mabasededonnee = "Client";

// Open the link to the MySQL server and stop right away if that fails.
$connection = mysql_connect("localhost", "root", "motdepasse");
if (!$connection) {
    die("connection impossible");
}

// Select the working database and stop if the selection is refused.
if (!mysql_select_db($mabasededonnee)) {
    die("pas de connection");
}
/*******************************************************************************
* Initialisation
***************************************************************************/
// Name of the SQL table to scan.
$table_sql = '';

// Name of the identifier column of that table.
$champ_id = '';

// Name of the column on which the similarity search is performed.
$champ_recherche_sql = '';

// Percentage of common words that the comparison loop tests each pair against.
// Original author's guidance: a perfect duplicate is 100% common words, so use 100
// for that case; use 90 to tolerate 90% common words.
// NOTE(review): confirm this guidance against the comparison condition in the
// algorithm section further down.
$valeur_pourcentage = 60;

// Bound used by the first loop of the algorithm. Start small: the script takes a
// long time to run. To loop over the whole table, replace $nbre_elements with
// sizeof($tab_matchv) in that loop.
$nbre_elements = 15;
/*******************************************************************************
* Script run time - record the start instant
***************************************************************************/
// microtime() without argument returns the string "fraction seconds"; summing the
// two space-separated parts yields the current time in seconds as a float.
$starttime = $mtime = array_sum(explode(" ", microtime()));
/**
 * Replaces accented Latin letters with their unaccented ASCII counterparts.
 *
 * The substitution uses the array form of strtr(), which replaces whole
 * substrings. The three-string form works byte by byte, so it mis-maps accented
 * letters as soon as this file and the data are UTF-8 (each accented letter is
 * then two bytes long). The array form works for single-byte encodings too, as
 * long as the text uses the same encoding as this source file.
 *
 * @param string $texte Text to normalise.
 * @return string The text with every listed accented letter replaced; all other
 *                characters are returned unchanged.
 */
function TexteSansAccent($texte){
    // Built once per request; same letter pairs as the historical lookup strings.
    static $sans_accent = array(
        'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A',
        'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a',
        'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ø' => 'O',
        'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
        'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
        'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
        'Ç' => 'C', 'ç' => 'c',
        'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
        'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
        'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
        'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
        'ÿ' => 'y', 'Ñ' => 'N', 'ñ' => 'n'
    );

    return strtr($texte, $sans_accent);
}
/**
 * Removes the backslashes added by magic quotes, when that feature can be active.
 *
 * Magic quotes were removed in PHP 5.4: from that version on
 * get_magic_quotes_gpc() always returned false, and the function itself no longer
 * exists in PHP 8.0. The call is therefore only made on older runtimes, which
 * avoids a fatal "undefined function" error on current PHP while keeping the
 * historical behaviour on PHP < 5.4.
 *
 * @param string $chaine Text to clean.
 * @return string $chaine with slashes stripped when magic quotes are on,
 *                otherwise $chaine unchanged.
 */
function AuStrip_Slashes($chaine) {
    if (version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc() == 1) {
        return stripslashes($chaine);
    }

    return $chaine;
}
/*******************************************************************************
* Stop words
***************************************************************************/
// Words removed from every row before rows are compared (see the array_diff() calls
// in the two query loops). Duplicates in the list are harmless.
// Fixes compared with the historical list:
//  - "je juste" and "soyez sujet" were two words fused into one entry (missing
//    comma); such an entry can never equal a single token, so each is now split.
//  - the query loops strip accents before filtering, so unaccented variants of the
//    accented entries are added at the end; the accented spellings are kept.
//  - the single letters "j" and "k" were missing from the alphabet list.
$stop_words = array("alors","au","aucuns","aussi","autre","avant","avec","avoir",
"bon","car","ce","cela","ces","ceux", "chaque","ci","comme",
"comment","dans","des","du","dedans","dehors","depuis","deux",
"devrait","doit", "donc","dos","droite","début","elle","elles",
"en","encore","essai","est","et","eu","fait","faites", "fois",
"font","force","haut","hors","ici","il","ils","je","juste","la",
"le","les","leur","là","ma",
"maintenant","mais","mes","mine","moins","mon","mot","même","ni",
"nommés","notre","nous","nouveaux",
"ou","où","par","parce","parole","pas","personnes","peut","peu",
"pièce","plupart","pour","pourquoi",
"quand","que","quel","quelle","quelles","quels","qui","sa",
"sans","ses","seulement","si","sien","son",
"sont","sous","soyez","sujet","sur","ta","tandis","tellement",
"tels","tes","ton","tous","tout","trop",
"très","tu","valeur","voie","voient","vont","votre","vous",
"vu","ça","étaient","état","étions","été","être",
"un","deux","trois","quatre","cinq","six","sept","huit","neuf","dix",
"0","1","2","3","4","5","6","7","8","9","10",
"avec","chez","par","dans","des","en","de","une","votre",
"meilleurs","entre","entres",
"depuis","alors","ne","pas","du","meme",
"ou","nom","seuls","acceptes","ayant",
"mais","ou","et","donc","or","ni","car",
"vos","votre","mes","mien","mien","tien","tiens","tout",
"toute","toutes",
"que","quoi","qui","comment","peu","peut","pis","puis","pas",
"chaque","chacun","chacune",
"son","ses","au","aux","se","sur","ce","ceux","cette","ca",
"ci","ceci","cela","aussi","pour",
"petit","grand","moyen","large","haut","bas","milieu","droite",
"gauche","centre",
"dit","etre","leur","leurs",
"plus","moin","moins",
"es","est","sont","son","va","suis","ai","viens",
"a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q",
"r","s","t","u","v","w","x","y","z",
// unaccented variants of the accented entries above
"debut","nommes","piece","tres","etaient","etat","etions","ete"
);
/*******************************************************************************
* Set the execution time limit
***************************************************************************/
// A limit of 0 means PHP imposes no maximum execution time on this script.
set_time_limit(0);
/*******************************************************************************
* First query - load every row into a flat array
***************************************************************************/
// Layout of $tab_matchv: an even slot holds a row identifier, the following odd
// slot holds the words kept for that row, joined with commas.
$qv = mysql_query("SELECT $champ_id,$champ_recherche_sql FROM $table_sql");
if (!$qv) {
    // Without a result set the fetch loop below would only emit warnings.
    die("requete impossible");
}
$p = 0;
$regs_split = array();
$tab_matchv = array();
while ($rv = mysql_fetch_array($qv)) {
    // Plain assignment: the slot does not exist yet, so appending with ".=" read an
    // undefined offset. The cast keeps the stored id a string, as before.
    $tab_matchv[$p] = (string) $rv[$champ_id];
    $p++;

    // Normalise the searched text: strip slashes, strip accents, turn apostrophes
    // into spaces, lower-case.
    $ch = $rv[$champ_recherche_sql];
    $ch = AuStrip_Slashes($ch);
    $ch = TexteSansAccent($ch);
    $ch = str_replace("'", " ", $ch);
    $ch = strtolower($ch);

    // Cut the text into words on every run of non-alphabetic characters.
    // preg_split() replaces split(), which was removed in PHP 7; empty leading or
    // trailing tokens are kept, exactly as split() kept them.
    $regs_split = preg_split('/[^[:alpha:]]+/', $ch);

    // Drop the stop words, then store the remaining words as one comma-separated
    // string (the row variable $rv is no longer overwritten here).
    $regs_split2 = array_diff($regs_split, $stop_words);
    $tab_matchv[$p] = implode(',', $regs_split2);
    $p++;
}
/*******************************************************************************
* Second query - load every row into a flat array
***************************************************************************/
// Layout of $tab_match: an even slot holds a row identifier, the following odd slot
// holds the words kept for that row, joined with commas.
$q = mysql_query("SELECT $champ_id,$champ_recherche_sql FROM $table_sql");
if (!$q) {
    // Without a result set the fetch loop below would only emit warnings.
    die("requete impossible");
}
$i = 0;
$regs1_split = array();
$tab_match = array();
while ($r = mysql_fetch_array($q)) {
    // Plain assignment: the slot does not exist yet, so appending with ".=" read an
    // undefined offset. The cast keeps the stored id a string, as before.
    $tab_match[$i] = (string) $r[$champ_id];
    $i++;

    // Normalise the searched text: strip slashes, strip accents, turn apostrophes
    // into spaces, lower-case.
    $ch = $r[$champ_recherche_sql];
    $ch = AuStrip_Slashes($ch);
    $ch = TexteSansAccent($ch);
    $ch = str_replace("'", " ", $ch);
    $ch = strtolower($ch);

    // Cut the text into words on every run of non-alphabetic characters.
    // preg_split() replaces split(), which was removed in PHP 7; empty leading or
    // trailing tokens are kept, exactly as split() kept them.
    $regs1_split = preg_split('/[^[:alpha:]]+/', $ch);

    // Drop the stop words, then store the remaining words as one comma-separated
    // string.
    $regs1_split2 = array_diff($regs1_split, $stop_words);
    $tab_match[$i] = implode(',', $regs1_split2);
    $i++;
}
/*******************************************************************************
* Screen output
***************************************************************************/
// Half the size of $tab_match, rounded, is reported as the number of fields.
$nombre_de_champs = round(sizeof($tab_match) / 2);
echo "<strong>Nombre de champs : ", $nombre_de_champs, "<br /><br /></strong>";
// Report the loop bound and the similarity threshold used by the run.
echo "<strong>Verifie les ", $nbre_elements, " premiers éléments<br /><br />";
echo "<strong>Tolérance maximum de mots communs en pourcentage :</strong>
", $valeur_pourcentage, "%</strong><br /><br />";
/*******************************************************************************
* The algorithm
***************************************************************************/
// $tab_matchv and $tab_match are flat arrays: an even slot holds a row id and the
// following odd slot holds that row's words joined with commas. Every reference row
// taken from $tab_matchv is compared with every row of $tab_match.
//
// Changes compared with the historical loop:
//  - a row is excluded from comparison with itself by id. The old test
//    "$accuracy != 100" also hid genuine duplicates (different ids, same words).
//  - the threshold test is ">=" so that $valeur_pourcentage = 100 reports perfect
//    duplicates, as the configuration comments describe.
//  - the outer loop is bounded by the data actually loaded, so $nbre_elements may
//    exceed the table size without reading undefined offsets.
//  - explode() replaces split(), which was removed in PHP 7.
$k = 0;
$delete_id_tb1 = array();
$delete_id_tb2 = array();

// $nbre_elements counts array slots (two per row), as in the historical loop.
$limite = min($nbre_elements, sizeof($tab_matchv) - 1);
$taille_tab_match = sizeof($tab_match);

for ($p = 1; $p - 1 < $limite; $p += 2) {
    // Id and words of the reference row; the word list does not depend on the inner
    // loop, so it is split only once.
    $tab_matchv_id = $tab_matchv[$p - 1];
    $reception = explode(',', $tab_matchv[$p]);

    for ($j = 1; $j < $taille_tab_match; $j += 2) {
        $save_id = $tab_match[$j - 1];

        // Never compare a row with itself.
        if ((string) $save_id === (string) $tab_matchv_id) {
            continue;
        }

        // Words of the candidate row that are absent from the reference row.
        $regs = explode(',', $tab_match[$j]);
        $non_matches = array_diff($regs, $reception);

        // explode() always returns at least one element, so the divisor is never 0.
        $inaccuracy = round((sizeof($non_matches) / sizeof($regs)) * 100);
        $accuracy = 100 - $inaccuracy;

        if ($accuracy >= $valeur_pourcentage) {
            // Remember both ids of the pair and print the match.
            $delete_id_tb1[$k] = $tab_matchv_id;
            $delete_id_tb2[$k] = $save_id;
            echo 'Il y a <strong><font color="#ff0000">'.$accuracy.'</font></strong>%
de contenu similaire entre: <br /><strong>ID: '.$tab_matchv_id.'</strong>
>>'.$tab_matchv[$p].'<br /><strong>ID : '.$save_id.'</strong> >> ';
            echo ''.$tab_match[$j].'<br /><br /><br />
';
            $k++;
        }
    }
}
/*******************************************************************************
* For deletion - it is up to you to decide which table the deletion applies to.
* Both lists of ids are printed below; the rest is left to the reader.
***************************************************************************/
// Each list is de-duplicated and printed as comma-separated ids.
echo '<br /><br />Liste des ID de la premiere table<br />',
    implode(',', array_unique($delete_id_tb1)),
    '<br /><br />Liste des ID de la seconde table<br />',
    implode(',', array_unique($delete_id_tb2));

// End instant: sum of the two space-separated parts returned by microtime().
$endtime = $mtime = array_sum(explode(" ", microtime()));
$totaltime = ($endtime - |