01.
02.
03.
04.
05.
06.
07.
08.
09.
10.
11.
12.
13.
14.
15.
16.
17.
18.
19.
20.
21.
22.
23.
24.
25.
26.
27.
28.
29.
30.
31.
32.
33.
34.
35.
36.
37.
38.
39.
40.
41.
42.
43.
44.
45.
46.
47.
48.
49.
50.
51.
52.
53.
54.
55.
56.
57.
58.
59.
60.
61.
62.
63.
64.
65.
66.
67.
68.
69.
70.
71.
72.
73.
74.
75.
76.
77.
78.
79.
80.
81.
82.
83.
84.
85.
86.
87.
88.
89.
90.
91.
92.
93.
94.
95.
96.
97.
98.
99.
100.
101.
102.
103.
104.
105.
106.
107.
108.
109.
110.
111.
112.
113.
114.
115.
116.
117.
118.
119.
120.
121.
122.
123.
124.
125.
126.
127.
128.
129.
130.
131.
132.
133.
134.
135.
136.
137.
138.
139.
140.
141.
142.
143.
144.
145.
146.
147.
148.
149.
150.
151.
152.
153.
154.
155.
156.
157.
158.
159.
160.
161.
162.
163.
164.
165.
166.
167.
168.
169.
170.
171.
172.
173.
174.
175.
176.
177.
178.
179.
180.
181.
182.
183.
184.
185.
186.
187.
188.
189.
190.
191.
192.
193.
194.
195.
196.
197.
198.
199.
200.
201.
202.
203.
204.
205.
206.
207.
208.
209.
210.
211.
212.
213.
214.
215.
216.
217.
218.
219.
220.
221.
222.
223.
224.
225.
226.
227.
228.
229.
230.
231.
232.
233.
234.
235.
236.
237.
238.
239.
240.
241.
242.
243.
244.
245.
246.
247.
248.
249.
250.
251.
252.
253.
254.
255.
256.
257.
258.
259.
260.
261.
262.
263.
264.
265.
266.
267.
268.
269.
270.
271.
272.
273.
274.
275.
276.
277.
278.
279.
280.
281.
282.
283.
284.
285.
286.
287.
288.
289.
290.
291.
292.
293.
294.
295.
296.
297.
298.
299.
300.
301.
302.
303.
304.
305.
306.
|
|
<?php
/******************************************************************************/
/* */
/* __ ____ */
/* ___ / / ___ / __/__ __ _____________ ___ */
/* / _ \/ _ \/ _ \_\ \/ _ \/ // / __/ __/ -_|_-< */
/* / .__/_//_/ .__/___/\___/\_,_/_/ \__/\__/___/ */
/* /_/ /_/ */
/* */
/* */
/******************************************************************************/
/* */
/* Titre : Connaitre massivement les doublons dans une table SQL */
/* */
/* URL : http://www.phpsources.org/scripts333-PHP.htm */
/* Auteur : KOogar */
/* Date édition : 31 Jan 2008 */
/* Website auteur : http://www.koogar.fr */
/* */
/******************************************************************************/
/*******************************************************************************
* SQL connection
***************************************************************************/
// Open the link to the MySQL server and stop the script if it fails.
$connection = mysql_connect("localhost","root","motdepasse");
if (!$connection) {
    die("connection impossible");
}

// Database the script works on; stop the script if it cannot be selected.
$mabasededonnee = "Client";
if (!mysql_select_db($mabasededonnee)) {
    die("pas de connection");
}
/*******************************************************************************
* Initialisation
***************************************************************************/
// Similarity threshold, expressed as a percentage of words in common.
// A perfect duplicate has 100% of its words in common, so set this to 100 in
// that case; set it to 90 to tolerate 90% of common words, and so on.
$valeur_pourcentage = 60;

// Name of the SQL table to scan.
$table_sql = '';

// Name of the identifier column.
$champ_id = '';

// Name of the column on which the similarity search is performed.
$champ_recherche_sql = '';

// Number of elements to check. Start small: the script takes a long time to
// run. This value bounds the first loop of the algorithm; to loop over the
// whole table, replace $nbre_elements with sizeof($tab_matchv) there.
$nbre_elements = 15;
/*******************************************************************************
* Script run time: record the start timestamp
***************************************************************************/
// microtime(true) returns the current Unix timestamp as a float (seconds,
// with microseconds as the fractional part). It replaces the manual parsing
// of the "usec sec" string, whose index expression was invalid.
$mtime = microtime(true);
$starttime = $mtime;
/*******************************************************************************
* Strip accents
***************************************************************************/
/**
 * Replaces accented Latin letters with their unaccented ASCII equivalents.
 *
 * The array form of strtr() is used because it substitutes whole substrings.
 * The three-string form maps byte to byte, which corrupts the text as soon as
 * the accented letters are multi-byte sequences (UTF-8 source file and input).
 * The array form works with single-byte encodings as well, provided this file
 * and the input text share the same encoding.
 *
 * @param string $texte Text to transliterate.
 * @return string The text with the letters listed below replaced; every other
 *                character is returned unchanged.
 */
function TexteSansAccent($texte){
    static $remplacements = array(
        'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A',
        'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a',
        'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ø' => 'O',
        'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
        'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
        'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
        'Ç' => 'C', 'ç' => 'c',
        'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
        'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
        'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
        'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
        'ÿ' => 'y',
        'Ñ' => 'N', 'ñ' => 'n'
    );

    return strtr($texte, $remplacements);
}
/*******************************************************************************
* magic_quotes
***************************************************************************/
/**
 * Removes the backslashes added by magic quotes, when that feature is active.
 *
 * Magic quotes were removed in PHP 5.4; get_magic_quotes_gpc() always returns
 * false from that version on, is deprecated in PHP 7.4 and no longer exists in
 * PHP 8. The version check therefore keeps the original behaviour on old
 * interpreters and avoids calling a deprecated or missing function on newer
 * ones, where the string is returned unchanged.
 *
 * @param string $chaine String to clean.
 * @return string $chaine without the added slashes when magic quotes are on,
 *                $chaine unchanged otherwise.
 */
function AuStrip_Slashes($chaine) {
    $magic_quotes_actives = version_compare(PHP_VERSION, '5.4.0', '<')
        && get_magic_quotes_gpc() == 1;

    return $magic_quotes_actives ? stripslashes($chaine) : $chaine;
}
/*******************************************************************************
* Stop words
*
* Words removed from every text before the comparison. The texts are stripped
* of their accents, lower-cased and split into single words before this list
* is applied, so only unaccented single-word entries can actually match. The
* accented entries are kept as they were; their unaccented forms are listed
* as well.
***************************************************************************/
$stop_words = array(
    "alors","au","aucuns","aussi","autre","avant","avec","avoir",
    "bon","car","ce","cela","ces","ceux","chaque","ci","comme",
    "comment","dans","des","du","dedans","dehors","depuis",
    "deux",
    "devrait","doit","donc","dos","droite","début","elle",
    "elles",
    "en","encore","essai","est","et","eu","fait","faites",
    "fois",
    // "je" and "juste" were merged into the single entry "je juste".
    "font","force","haut","hors","ici","il","ils","je","juste",
    "la",
    "le","les","leur","là","ma",
    "maintenant","mais","mes","mine","moins","mon","mot","même",
    "ni",
    "nommés","notre","nous","nouveaux",
    "ou","où","par","parce","parole","pas","personnes","peut",
    "peu",
    "pièce","plupart","pour","pourquoi",
    "quand","que","quel","quelle","quelles","quels","qui","sa",
    "sans","ses","seulement","si","sien","son",
    // "soyez" and "sujet" were merged into the single entry "soyez sujet".
    "sont","sous","soyez","sujet","sur","ta","tandis","tellement",
    "tels","tes","ton","tous","tout","trop",
    "très","tu","valeur","voie","voient","vont","votre","vous",
    "vu","ça","étaient","état","étions","été","être",
    "un","deux","trois","quatre","cinq","six","sept","huit",
    "neuf","dix",
    "0","1","2","3","4","5","6","7","8","9","10",
    "avec","chez","par","dans","des","en","de","une","votre",
    "meilleurs","entre","entres",
    "depuis","alors","ne","pas","du","meme",
    "ou","nom","seuls","acceptes","ayant",
    "mais","ou","et","donc","or","ni","car",
    "vos","votre","mes","mien","mien","tien","tiens","tout",
    "toute","toutes",
    "que","quoi","qui","comment","peu","peut","pis","puis","pas",
    "chaque","chacun","chacune",
    "son","ses","au","aux","se","sur","ce","ceux","cette","ca",
    "ci","ceci","cela","aussi","pour",
    "petit","grand","moyen","large","haut","bas","milieu",
    "droite",
    "gauche","centre",
    "dit","etre","leur","leurs",
    "plus","moin","moins",
    "es","est","sont","son","va","suis","ai","viens",
    "a","b","c","d","e","f","g","h","i","l","m","n","o","p","q",
    "r","s","t","u","v","w","x","y","z",
    // Unaccented forms of the accented entries above that were not already
    // present in the list.
    "debut","nommes","piece","tres","etaient","etat","etions","ete"
);
/*******************************************************************************
* Execution time limit
***************************************************************************/
// set_time_limit() requires the number of seconds as its argument; 0 means
// that no time limit is imposed on the script.
set_time_limit(0);
/*******************************************************************************
* First query - load every row into a flat array
*
* $tab_matchv alternates two kinds of entries: the row identifier at an even
* index, followed at the next (odd) index by the comma-separated list of the
* words kept from the searched column.
***************************************************************************/
$qv = mysql_query("SELECT $champ_id,$champ_recherche_sql FROM $table_sql");
// mysql_query() returns false when the statement fails; stop here instead of
// passing false to mysql_fetch_array().
if ( ! $qv )
    die ("requete impossible : " . mysql_error());
$p = 0;
$regs_split = array();
$tab_matchv = array();
while ($rv = mysql_fetch_array($qv)) {
    // Even slot: the identifier, stored as a string.
    $tab_matchv[$p] = (string) $rv[$champ_id];
    $p++;
    // Text to break into words.
    $ch = $rv[$champ_recherche_sql];
    // Remove slashes added by magic quotes, if any.
    $ch = AuStrip_Slashes($ch);
    // Remove accents.
    $ch = TexteSansAccent($ch);
    // Replace apostrophes with spaces.
    $ch = str_replace("'"," ",$ch);
    // Lower-case the text.
    $ch = strtolower($ch);
    // Split on every run of non-alphabetic characters. preg_split() replaces
    // split(), which no longer exists in PHP 7; like split(), it keeps the
    // empty strings produced by leading or trailing separators.
    $regs_split = preg_split('/[^[:alpha:]]+/', $ch);
    // Drop the stop words by taking the difference of the two arrays.
    $regs_split2 = array_diff($regs_split,$stop_words);
    // Odd slot: the remaining words, comma-separated.
    $tab_matchv[$p] = implode(',',$regs_split2);
    $p++;
}
/*******************************************************************************
* Second query - load every row into a flat array
*
* $tab_match alternates two kinds of entries: the row identifier at an even
* index, followed at the next (odd) index by the comma-separated list of the
* words kept from the searched column.
***************************************************************************/
$q = mysql_query("SELECT $champ_id,$champ_recherche_sql FROM $table_sql");
// mysql_query() returns false when the statement fails; stop here instead of
// passing false to mysql_fetch_array().
if ( ! $q )
    die ("requete impossible : " . mysql_error());
$i = 0;
$regs1_split = array();
$tab_match = array();
while ($r = mysql_fetch_array($q)) {
    // Even slot: the identifier, stored as a string.
    $tab_match[$i] = (string) $r[$champ_id];
    $i++;
    // Text to break into words.
    $ch = $r[$champ_recherche_sql];
    // Remove slashes added by magic quotes, if any.
    $ch = AuStrip_Slashes($ch);
    // Remove accents.
    $ch = TexteSansAccent($ch);
    // Replace apostrophes with spaces.
    $ch = str_replace("'"," ",$ch);
    // Lower-case the text.
    $ch = strtolower($ch);
    // Split on every run of non-alphabetic characters. preg_split() replaces
    // split(), which no longer exists in PHP 7; like split(), it keeps the
    // empty strings produced by leading or trailing separators.
    $regs1_split = preg_split('/[^[:alpha:]]+/', $ch);
    // Drop the stop words by taking the difference of the two arrays.
    $regs1_split2 = array_diff($regs1_split,$stop_words);
    // Odd slot: the remaining words, comma-separated.
    $tab_match[$i] = implode(',',$regs1_split2);
    $i++;
}
/*******************************************************************************
* Screen output: summary of the run
***************************************************************************/
// $tab_match holds two entries per row (identifier, then word list), hence
// the division by two to get the number of rows.
echo "<strong>Nombre de champs : ".round(sizeof($tab_match)/2)."<br /><br /></strong>";
// Each <strong> is now closed once: the second line used to leave its tag
// open and the third one closed a tag twice.
echo "<strong>Verifie les $nbre_elements premiers éléments</strong><br /><br />";
echo "<strong>Tolérance maximum de mots communs en pourcentage :</strong>
".$valeur_pourcentage."%<br /><br />";
/*******************************************************************************
* The algorithm
*
* $tab_matchv and $tab_match are flat arrays holding two entries per record:
* the identifier at an even index and the comma-separated word list at the
* following odd index. Each of the first $nbre_elements records of $tab_matchv
* is compared with every record of $tab_match. The similarity is the share of
* the words of the $tab_match record that also appear in the $tab_matchv
* record, as a rounded percentage.
*
* Pairs reaching $valeur_pourcentage are printed and their identifiers are
* collected in $delete_id_tb1 (from $tab_matchv) and $delete_id_tb2 (from
* $tab_match).
***************************************************************************/
$k = 0;
$delete_id_tb1 = array();
$delete_id_tb2 = array();
$taille_tab_match = sizeof($tab_match);
// Two slots per record: check $nbre_elements records, without ever reading
// past the end of the table.
$limite_p = min(2 * $nbre_elements, sizeof($tab_matchv));
for ($p = 1; $p < $limite_p; $p += 2)
{
    $tab_matchv_id = $tab_matchv[$p-1];
    // Words of the record being checked; computed once per outer iteration.
    // Empty entries (left by leading/trailing separators) are not words.
    $reception = array_filter(explode(',', $tab_matchv[$p]), 'strlen');
    for ($j = 1; $j < $taille_tab_match; $j += 2)
    {
        $save_id = $tab_match[$j-1];
        // Both arrays are loaded from the same table in this script, so an
        // identical identifier means the record is being compared with
        // itself. Skipping it here lets genuine 100% matches be reported.
        if ((string) $save_id === (string) $tab_matchv_id) {
            continue;
        }
        // Words of the candidate record.
        $regs = array_filter(explode(',', $tab_match[$j]), 'strlen');
        $regs_size = sizeof($regs);
        // A record without any significant word cannot be compared (and
        // would cause a division by zero below).
        if ($regs_size == 0) {
            continue;
        }
        // Words of the candidate that are missing from the checked record.
        $non_matches = array_diff($regs, $reception);
        $non_matches_size = sizeof($non_matches);
        // Percentage of missing words, then of common words.
        $inaccuracy = round(($non_matches_size/$regs_size)*100);
        $accuracy = 100-$inaccuracy;
        // ">=" so that a threshold of 100 selects perfect duplicates, as the
        // configuration comment describes.
        if ($accuracy >= $valeur_pourcentage) {
            $delete_id_tb1[$k] = $tab_matchv_id;
            $delete_id_tb2[$k] = $save_id;
            echo 'Il y a <strong><font color="#ff0000">'.$accuracy.'</font></strong>%
de contenu similaire entre: <br /><strong>ID: '.$tab_matchv_id.
'</strong>
>>'.$tab_matchv[$p].'<br /><strong>ID : '.$save_id.'</strong> >> ';
            echo ''.$tab_match[$j].'<br /><br /><br />
';
            $k++;
        }
    }
}
/*******************************************************************************
* Deleting - it is up to you to decide on which table the deletion is done.
* Both lists of IDs are printed below; the rest is left to you.
***************************************************************************/
echo '<br /><br />Liste des ID de la premiere table<br />';
echo implode(',',array_unique($delete_id_tb1));
echo '<br /><br />Liste des ID de la seconde table<br />';
echo implode(',',array_unique($delete_id_tb2));
// Elapsed time: microtime(true) returns the current Unix timestamp as a float
// in seconds, the same unit as $starttime. It replaces the manual parsing of
// the "usec sec" string, whose index expression was invalid.
$mtime = microtime(true);
$endtime = $mtime;
$totaltime = ($endtime - $starttime);
echo '<br /><br /><strong>Page générée en ',number_format($totaltime,4,',',''),
' s</strong>';
/*******************************************************************************
* End: close the MySQL connection
***************************************************************************/
mysql_close($connection);
?>
|