-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParser.cs
137 lines (131 loc) · 6.78 KB
/
Parser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/**
* Created by Abdessalam BENHARIRA
* Project Name : Web Parser
*/
using System;
//List
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
//Regex
using System.Text.RegularExpressions;
namespace WebParser {
/// <summary>
/// Downloads a web page and pushes it through a chain of text-file
/// transformations stored in the current user's Desktop folder.
/// </summary>
/// <remarks>
/// Each step reads the file written by the previous one, so the methods are
/// meant to be called in this order:
/// RecupererData, ChangerTitre, SupprimerScripts, ChangerSrcImg, TexteEnGras,
/// NettoyageHtml, MotFrequent.
/// Every step appends to its output file: running the chain again without
/// cleaning up adds to the existing output instead of replacing it.
/// </remarks>
class Parser {
    // Matches the <title> element; \s* lets the title text sit on its own line.
    private static readonly Regex TitreRegex = new Regex (@"<title>\s*(.+?)\s*</title>");
    // Matches a whole <script> element, including bodies that span several lines.
    private static readonly Regex ScriptRegex = new Regex (@"<script[^>]*>[\s\S]*?</script>");
    // Matches an <img> tag carrying a quoted src attribute.
    private static readonly Regex ImgRegex = new Regex ("<img.+?src=[\"'](.+?)[\"'].*?>");
    // Matches one markup tag. The previous pattern "<[^>].+?>" needed two characters
    // inside the tag, so "<p>text</p>" was removed together with its text.
    private static readonly Regex BaliseRegex = new Regex ("<[^>]+>");
    // Matches any run of whitespace.
    private static readonly Regex EspacesRegex = new Regex (@"\s+");
    // Characters that separate two words.
    private static readonly char[] Separateurs = { ',', '.', '!', '?', ';', ':', '/', '\'', '’', '(', ')', ' ' };

    /// <summary>HTML of the page, as returned by the last call to RecupererData.</summary>
    public string htmlCode;

    // Folder holding every file read or written by this class.
    private readonly string path = Environment.GetFolderPath (Environment.SpecialFolder.Desktop);

    /// <summary>
    /// Downloads the page, stores its HTML in <see cref="htmlCode"/> and appends it
    /// to "maPage.txt".
    /// </summary>
    public void RecupererData () {
        // https://msdn.microsoft.com/fr-fr/library/system.net.webclient(v=vs.110).aspx
        using (WebClient client = new WebClient ()) {
            htmlCode = client.DownloadString ("http://abdessalam-benharira.me");
        }
        using (StreamWriter swriter = new StreamWriter (Path.Combine (path, "maPage.txt"), true)) {
            swriter.WriteLine (htmlCode);
        }
    }

    /// <summary>
    /// Replaces the page title: reads "maPage.txt" and appends the result to "test.txt".
    /// </summary>
    public void ChangerTitre () {
        TransformerFichier ("maPage.txt", "test.txt", RemplacerTitre, false);
    }

    /// <summary>
    /// Removes every script element: reads "test.txt", appends the result to
    /// "test1.txt", then deletes "test.txt".
    /// </summary>
    public void SupprimerScripts () {
        TransformerFichier ("test.txt", "test1.txt", RetirerScripts, true);
    }

    /// <summary>
    /// Replaces every image tag: reads "test1.txt", appends the result to
    /// "test2.txt", then deletes "test1.txt".
    /// </summary>
    public void ChangerSrcImg () {
        TransformerFichier ("test1.txt", "test2.txt", RemplacerImages, true);
    }

    /// <summary>
    /// Makes all the text bold: reads "test2.txt", appends the result to
    /// "maPageFinale.txt", then deletes "test2.txt".
    /// </summary>
    public void TexteEnGras () {
        TransformerFichier ("test2.txt", "maPageFinale.txt", MettreEnGras, true);
    }

    /// <summary>
    /// Extracts the words of "maPageFinale.txt", line by line, and appends them to
    /// "texteSplit.txt", one word per line.
    /// </summary>
    public void NettoyageHtml () {
        // The reader is opened first so that a missing input fails before the
        // output file is created. The writer is opened once for the whole file.
        using (StreamReader reader = new StreamReader (Path.Combine (path, "maPageFinale.txt")))
        using (StreamWriter swriter = File.AppendText (Path.Combine (path, "texteSplit.txt"))) {
            string ligne;
            while ((ligne = reader.ReadLine ()) != null) {
                foreach (string mot in ExtraireMots (ligne)) {
                    swriter.WriteLine (mot);
                }
            }
        }
    }

    /// <summary>
    /// Finds the most frequent line of "texteSplit.txt" and appends it to
    /// "MotFrequent.txt".
    /// </summary>
    /// <exception cref="InvalidOperationException">"texteSplit.txt" contains no line.</exception>
    public void MotFrequent () {
        string motFrequent = TrouverMotFrequent (File.ReadLines (Path.Combine (path, "texteSplit.txt")));
        if (motFrequent == null) {
            throw new InvalidOperationException ("texteSplit.txt is empty: there is no word to count.");
        }
        using (StreamWriter swriter = File.AppendText (Path.Combine (path, "MotFrequent.txt"))) {
            swriter.WriteLine (motFrequent);
        }
    }

    /// <summary>Replaces the title element of <paramref name="html"/> with the project title.</summary>
    /// <param name="html">HTML text to rewrite.</param>
    /// <returns>The rewritten HTML.</returns>
    public static string RemplacerTitre (string html) {
        return TitreRegex.Replace (html, "<title>Projet Web Parser CSharp</title>");
    }

    /// <summary>Removes every script element, single-line or multi-line, from <paramref name="html"/>.</summary>
    /// <param name="html">HTML text to rewrite.</param>
    /// <returns>The HTML without its script elements.</returns>
    public static string RetirerScripts (string html) {
        return ScriptRegex.Replace (html, String.Empty);
    }

    /// <summary>Replaces every image tag of <paramref name="html"/> with a tag pointing at the logo.</summary>
    /// <param name="html">HTML text to rewrite.</param>
    /// <returns>The rewritten HTML.</returns>
    public static string RemplacerImages (string html) {
        // The whole tag is replaced, not only its src attribute.
        return ImgRegex.Replace (html, " <img src=\"http://abdessalam-benharira.me/blog/wp-content/uploads/2017/09/logo.png\">");
    }

    /// <summary>
    /// Inserts a style rule right before the closing head tag of <paramref name="html"/>.
    /// </summary>
    /// <param name="html">HTML text to rewrite.</param>
    /// <returns>The rewritten HTML.</returns>
    public static string MettreEnGras (string html) {
        // "!important" makes the rule win over the font weights declared earlier.
        return html.Replace ("</head>", "\t" + "<style> * { font-weight:bold !important; } </style>" + "\n" + "</head>");
    }

    /// <summary>
    /// Strips the markup tags from one line of HTML and splits what is left into words.
    /// </summary>
    /// <param name="ligne">One line of HTML; null or empty yields an empty list.</param>
    /// <returns>The distinct, non-blank words of the line, in order of first appearance.</returns>
    public static List<string> ExtraireMots (string ligne) {
        if (string.IsNullOrEmpty (ligne)) {
            return new List<string> ();
        }
        string texte = BaliseRegex.Replace (ligne, string.Empty);
        // Tabs and repeated blanks become a single space, which is one of the separators.
        texte = EspacesRegex.Replace (texte, " ");
        // NOTE(review): Distinct () drops the repetitions inside a line, so the count made
        // by MotFrequent is a number of lines rather than of occurrences - confirm this is intended.
        return texte.Split (Separateurs)
            .Where (mot => !string.IsNullOrWhiteSpace (mot))
            .Distinct ()
            .ToList ();
    }

    /// <summary>Returns the most frequent entry of <paramref name="mots"/>.</summary>
    /// <param name="mots">Entries to count.</param>
    /// <returns>
    /// The entry with the highest count (the first one met when several are tied),
    /// or null when <paramref name="mots"/> is empty.
    /// </returns>
    public static string TrouverMotFrequent (IEnumerable<string> mots) {
        return mots.GroupBy (mot => mot)
            .OrderByDescending (groupe => groupe.Count ())
            .Select (groupe => groupe.Key)
            .FirstOrDefault ();
    }

    // Reads the whole source file, applies the transformation and appends the result to the
    // destination file. Working on the whole text lets a pattern match across line breaks.
    private void TransformerFichier (string source, string destination, Func<string, string> transformation, bool supprimerSource) {
        string cheminSource = Path.Combine (path, source);
        string contenu = File.ReadAllText (cheminSource);
        File.AppendAllText (Path.Combine (path, destination), transformation (contenu));
        if (supprimerSource) {
            // The file that has just been read is no longer needed.
            File.Delete (cheminSource);
        }
    }
}
}