Skip to content

Commit 90db59a

Browse files
committed
🎉 Initial commit.
0 parents  commit 90db59a

File tree

9 files changed

+1191
-0
lines changed

9 files changed

+1191
-0
lines changed

.gitignore

Lines changed: 846 additions & 0 deletions
Large diffs are not rendered by default.

.idea/.idea.TelegramChatParser/.idea/vcs.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

TelegramChatParser.sln

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TelegramChatParser", "TelegramChatParser\TelegramChatParser.csproj", "{1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD}"
4+
EndProject
5+
Global
6+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
7+
Debug|Any CPU = Debug|Any CPU
8+
Release|Any CPU = Release|Any CPU
9+
EndGlobalSection
10+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
11+
{1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
12+
{1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD}.Debug|Any CPU.Build.0 = Debug|Any CPU
13+
{1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD}.Release|Any CPU.ActiveCfg = Release|Any CPU
14+
{1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD}.Release|Any CPU.Build.0 = Release|Any CPU
15+
EndGlobalSection
16+
EndGlobal

TelegramChatParser/Message.cs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
using System;
2+
3+
namespace E2.Utils
4+
{
5+
/// <summary>
/// A single chat message extracted from a Telegram HTML export.
/// </summary>
public class Message
{
    /// <summary>Numeric id of the message.</summary>
    public int Id { get; set; }

    /// <summary>Message text; may be <c>null</c> when the message has no text part.</summary>
    public string Content { get; set; }

    /// <summary>Display name of the sender.</summary>
    public string Author { get; set; }

    /// <summary>Timestamp parsed from the constructor's <c>dateTime</c> argument.</summary>
    public DateTime DateTime { get; set; }

    /// <summary>Id of the message this one replies to, or <c>null</c> when it is not a reply.</summary>
    public int? ReplyMessageId { get; set; }

    /// <summary>
    /// Creates a message from raw values.
    /// </summary>
    /// <param name="content">Message text; may be <c>null</c>.</param>
    /// <param name="author">Sender name; may be <c>null</c>.</param>
    /// <param name="dateTime">
    /// Timestamp in the form <c>day.month.year hour:minute:second</c>. Anything after the
    /// time token (for example a time-zone suffix) is ignored.
    /// </param>
    /// <param name="replyMessageId">Id of the replied-to message, or <c>null</c>.</param>
    /// <param name="id">Id of this message.</param>
    /// <exception cref="ArgumentNullException"><paramref name="dateTime"/> is <c>null</c>.</exception>
    /// <exception cref="FormatException"><paramref name="dateTime"/> does not have the expected shape.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The parsed components do not form a valid date.</exception>
    public Message(string content, string author, string dateTime, int? replyMessageId, int id)
    {
        DateTime = ParseTimestamp(dateTime);
        Id = id;
        Content = content;
        Author = author;
        ReplyMessageId = replyMessageId;
    }

    public override string ToString() =>
        $"{nameof(Id)}: {Id}, {nameof(ReplyMessageId)}: {ReplyMessageId}, {nameof(Author)}: {Author}, {nameof(DateTime)}: {DateTime}, {nameof(Content)}: {Content}";

    /// <summary>
    /// Renders the message as one CSV row: id, reply id, author, timestamp, content.
    /// </summary>
    /// <remarks>
    /// Text fields are flattened to a single line and quoted when they contain a comma or a
    /// double quote, so free text cannot shift columns. The timestamp uses a fixed,
    /// culture-independent <c>yyyy-MM-dd HH:mm:ss</c> layout so the file reads the same on every machine.
    /// </remarks>
    public string ToCsv() =>
        string.Join(
            ",",
            Id.ToString(System.Globalization.CultureInfo.InvariantCulture),
            ReplyMessageId?.ToString(System.Globalization.CultureInfo.InvariantCulture) ?? string.Empty,
            EscapeCsvField(Author),
            DateTime.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture),
            EscapeCsvField(Content));

    // Parses "d.M.yyyy H:m:s[ ...]" into a DateTime, reporting malformed input as FormatException.
    private static DateTime ParseTimestamp(string dateTime)
    {
        if (dateTime == null)
        {
            throw new ArgumentNullException(nameof(dateTime));
        }

        // Splitting on any whitespace run tolerates leading/trailing/double spaces.
        string[] tokens = dateTime.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        string[] date = tokens.Length >= 2 ? tokens[0].Split('.') : new string[0];
        string[] time = tokens.Length >= 2 ? tokens[1].Split(':') : new string[0];

        if (date.Length < 3 || time.Length < 3)
        {
            throw new FormatException(
                $"Expected a timestamp like 'dd.MM.yyyy HH:mm:ss' but got '{dateTime}'.");
        }

        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        return new DateTime(
            int.Parse(date[2], invariant),
            int.Parse(date[1], invariant),
            int.Parse(date[0], invariant),
            int.Parse(time[0], invariant),
            int.Parse(time[1], invariant),
            int.Parse(time[2], invariant));
    }

    // Flattens line breaks to spaces and applies CSV quoting when the value needs it.
    private static string EscapeCsvField(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        string flat = value
            .Replace("\r\n", " ")
            .Replace('\n', ' ')
            .Replace('\r', ' ');

        if (flat.IndexOfAny(new[] { ',', '"' }) < 0)
        {
            return flat;
        }

        // Embedded quotes are doubled and the whole field is wrapped in quotes.
        return "\"" + flat.Replace("\"", "\"\"") + "\"";
    }
}
45+
}

TelegramChatParser/Program.cs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
using System;
2+
using CommandLine;
3+
using E2.Utils;
4+
5+
namespace TelegramChatParser
6+
{
7+
/// <summary>
/// Command-line entry point: parses the arguments into a <see cref="TgParser"/> options
/// object and converts the given HTML export to CSV.
/// </summary>
public class Program
{
    /// <summary>
    /// Runs the conversion. Any exception is printed on a red background and the process
    /// exit code is set to 1 so scripts can detect the failure.
    /// </summary>
    /// <param name="args">Raw command-line arguments.</param>
    public static void Main(string[] args)
    {
        try
        {
            Parser
                .Default
                .ParseArguments<TgParser>(args)
                .WithParsed<TgParser>(options =>
                {
                    // Re-create the parser through its validating constructor so the
                    // path checks and warnings it performs are applied to the parsed options.
                    TgParser tgParser = new TgParser(
                        options.InputHtmlPath,
                        options.CsvFilePath,
                        options.Verbose,
                        options.Append);

                    tgParser.CreateCsv();
                });
        }
        catch (Exception e)
        {
            Console.BackgroundColor = ConsoleColor.Red;
            Console.WriteLine($"Exception thrown: \nMessage -> {e.Message}\nStackTrace -> {e.StackTrace}");
            // Restore the user's own console colours instead of forcing a black background.
            Console.ResetColor();
            Environment.ExitCode = 1;
        }
    }
}
32+
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
using System.Reflection;
2+
using System.Runtime.InteropServices;
3+
4+
// General Information about an assembly is controlled through the following
5+
// set of attributes. Change these attribute values to modify the information
6+
// associated with an assembly.
7+
[assembly: AssemblyTitle("TelegramChatParser")]
8+
[assembly: AssemblyDescription("")]
9+
[assembly: AssemblyConfiguration("")]
10+
[assembly: AssemblyCompany("")]
11+
[assembly: AssemblyProduct("TelegramChatParser")]
12+
[assembly: AssemblyCopyright("Copyright © 2019")]
13+
[assembly: AssemblyTrademark("")]
14+
[assembly: AssemblyCulture("")]
15+
16+
// Setting ComVisible to false makes the types in this assembly not visible
17+
// to COM components. If you need to access a type in this assembly from
18+
// COM, set the ComVisible attribute to true on that type.
19+
[assembly: ComVisible(false)]
20+
21+
// The following GUID is for the ID of the typelib if this project is exposed to COM
22+
[assembly: Guid("1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD")]
23+
24+
// Version information for an assembly consists of the following four values:
25+
//
26+
// Major Version
27+
// Minor Version
28+
// Build Number
29+
// Revision
30+
//
31+
// You can specify all the values or you can default the Build and Revision Numbers
32+
// by using the '*' as shown below:
33+
// [assembly: AssemblyVersion("1.0.*")]
34+
[assembly: AssemblyVersion("1.0.0.0")]
35+
[assembly: AssemblyFileVersion("1.0.0.0")]
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
4+
<PropertyGroup>
5+
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
6+
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
7+
<ProjectGuid>{1E6DDAC1-C1D5-4372-BEA6-06A0C4DC63FD}</ProjectGuid>
8+
<OutputType>Exe</OutputType>
9+
<AppDesignerFolder>Properties</AppDesignerFolder>
10+
<RootNamespace>TelegramChatParser</RootNamespace>
11+
<AssemblyName>TelegramChatParser</AssemblyName>
12+
<TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
13+
<FileAlignment>512</FileAlignment>
14+
</PropertyGroup>
15+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
16+
<PlatformTarget>AnyCPU</PlatformTarget>
17+
<DebugSymbols>true</DebugSymbols>
18+
<DebugType>full</DebugType>
19+
<Optimize>false</Optimize>
20+
<OutputPath>bin\Debug\</OutputPath>
21+
<DefineConstants>DEBUG;TRACE</DefineConstants>
22+
<ErrorReport>prompt</ErrorReport>
23+
<WarningLevel>4</WarningLevel>
24+
</PropertyGroup>
25+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
26+
<PlatformTarget>AnyCPU</PlatformTarget>
27+
<DebugType>pdbonly</DebugType>
28+
<Optimize>true</Optimize>
29+
<OutputPath>bin\Release\</OutputPath>
30+
<DefineConstants>TRACE</DefineConstants>
31+
<ErrorReport>prompt</ErrorReport>
32+
<WarningLevel>4</WarningLevel>
33+
</PropertyGroup>
34+
<ItemGroup>
35+
<Reference Include="CommandLine, Version=2.5.0.0, Culture=neutral, PublicKeyToken=5a870481e358d379">
36+
<HintPath>..\packages\CommandLineParser.2.5.0\lib\net461\CommandLine.dll</HintPath>
37+
<Private>True</Private>
38+
</Reference>
39+
<Reference Include="HtmlAgilityPack, Version=1.4.9.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a">
40+
<HintPath>..\packages\HtmlAgilityPack.CssSelectors.1.0.2\lib\net45\HtmlAgilityPack.dll</HintPath>
41+
<Private>True</Private>
42+
</Reference>
43+
<Reference Include="HtmlAgilityPack.CssSelectors, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null">
44+
<HintPath>..\packages\HtmlAgilityPack.CssSelectors.1.0.2\lib\net45\HtmlAgilityPack.CssSelectors.dll</HintPath>
45+
<Private>True</Private>
46+
</Reference>
47+
<Reference Include="System" />
48+
<Reference Include="System.Core" />
49+
<Reference Include="System.Data" />
50+
<Reference Include="System.Xml" />
51+
</ItemGroup>
52+
<ItemGroup>
53+
<Compile Include="Message.cs" />
54+
<Compile Include="TgParser.cs" />
55+
<Compile Include="Program.cs" />
56+
<Compile Include="Properties\AssemblyInfo.cs" />
57+
</ItemGroup>
58+
<ItemGroup>
59+
<None Include="packages.config" />
60+
</ItemGroup>
61+
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
62+
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
63+
Other similar extension points exist, see Microsoft.Common.targets.
64+
<Target Name="BeforeBuild">
65+
</Target>
66+
<Target Name="AfterBuild">
67+
</Target>
68+
-->
69+
70+
</Project>

TelegramChatParser/TgParser.cs

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Globalization;
4+
using System.IO;
5+
using System.Linq;
6+
using System.Text;
7+
using CommandLine;
8+
using HtmlAgilityPack;
9+
10+
namespace E2.Utils
11+
{
12+
/// <summary>
/// Converts a Telegram HTML chat export into a CSV file. The public properties double as
/// the command-line options bound by CommandLineParser.
/// </summary>
public class TgParser
{
    private const string MessagesQuerySelector = "div.default[id^='message']";
    private const string MessageTextQuerySelector = "div.body > div.text";
    private const string MessageDateQuerySelector = "div.body > div.pull_right.date.details";
    private const string MessageFromNameQuerySelector = "div.body > div.from_name";
    private const string MessageReplyQuerySelector = "div.body > div.reply_to.details > a";

    // Element ids look like "message123"; the numeric id follows this prefix.
    private const string MessageIdPrefix = "message";

    // Reply links end in "go_to_message123", optionally preceded by a file name and '#'.
    private const string ReplyAnchorMarker = "go_to_message";

    private const string CsvHeader = "MessageId,ReplyMessageId,Author,DateTime,Content";

    [Option(shortName: 'v', longName: "verbose", Default = false,
        HelpText = "Set output to verbose messages.")]
    public bool Verbose { get; set; }

    [Option(shortName: 'i', longName: "html-path", Required = true, HelpText = "Set html file path to parse.")]
    public string InputHtmlPath { get; set; }

    [Option(shortName: 'o', longName: "csv-path", Required = true,
        HelpText = "Set csv file path with it's name to save. ex: C:\\Users\\Ali\\Desktop\\chats.csv")]
    public string CsvFilePath { get; set; }

    [Option(shortName: 'a', longName: "append", HelpText = "Append to existing csv file.")]
    public bool Append { get; set; }

    /// <summary>Parameterless constructor required by the command-line binder.</summary>
    public TgParser()
    {
    }

    /// <summary>
    /// Creates a parser for the given paths and prints a warning or error when a path looks wrong.
    /// </summary>
    /// <param name="inputHtmlPath">Path of the exported HTML file to read.</param>
    /// <param name="csvFilePath">Path of the CSV file to write.</param>
    /// <param name="verbose">When <c>true</c>, every message is echoed to the console.</param>
    /// <param name="append">When <c>true</c>, rows are appended to an existing CSV file.</param>
    public TgParser(string inputHtmlPath, string csvFilePath, bool verbose = false, bool append = false)
    {
        // Validate the arguments, not the properties: the properties are still unset here.
        if (append && !File.Exists(csvFilePath))
        {
            WriteHighlighted(ConsoleColor.Yellow, "Warning: Csv file not exists, Creating new file...");
        }

        if (!File.Exists(inputHtmlPath))
        {
            // Only reported here; loading the document later fails with the real I/O exception.
            WriteHighlighted(ConsoleColor.Red, "Error: inputHtml dose not exists! aborting...");
        }

        Verbose = verbose;
        InputHtmlPath = inputHtmlPath;
        CsvFilePath = csvFilePath;
        Append = append;
    }

    /// <summary>
    /// Reads every message node from the input HTML.
    /// </summary>
    /// <returns>
    /// The parsed messages in reverse document order (last parsed first), which is the order
    /// this method has always produced. If a node cannot be parsed, the error is printed and
    /// the messages collected so far are returned.
    /// </returns>
    private List<Message> GetMessages()
    {
        var document = new HtmlDocument();
        document.Load(InputHtmlPath, encoding: Encoding.UTF8);

        var messageNodes = document.QuerySelectorAll(MessagesQuerySelector);

        var messages = new List<Message>();

        // A message without a from_name block takes the author of the message before it.
        string lastAuthor = null;

        try
        {
            foreach (var messageNode in messageNodes)
            {
                int messageId = int.Parse(
                    messageNode.Attributes["id"].Value.Substring(MessageIdPrefix.Length),
                    CultureInfo.InvariantCulture);

                // text can be null
                string text = ReadText(messageNode, MessageTextQuerySelector);

                string date = messageNode
                    .QuerySelector(MessageDateQuerySelector)
                    .Attributes["title"].Value;

                string author = ReadText(messageNode, MessageFromNameQuerySelector) ?? lastAuthor;

                int? replyMessageId = ParseReplyMessageId(
                    messageNode.QuerySelector(MessageReplyQuerySelector));

                messages.Add(new Message(text, author, date, replyMessageId, messageId));
                lastAuthor = author;
            }
        }
        catch (Exception e)
        {
            WriteHighlighted(
                ConsoleColor.Red,
                $"Exception thrown: \nMessage -> {e.Message}\nStackTrace -> {e.StackTrace}");
        }

        messages.Reverse();
        return messages;
    }

    /// <summary>
    /// Parses the input HTML and writes the messages to <see cref="CsvFilePath"/>.
    /// The header row is skipped when appending to a file that already has content.
    /// </summary>
    public void CreateCsv()
    {
        List<Message> messages = GetMessages();

        // Must be decided before the writer opens (and possibly creates) the file.
        bool appendingToExistingData =
            Append && File.Exists(CsvFilePath) && new FileInfo(CsvFilePath).Length > 0;

        using (StreamWriter sw = new StreamWriter(CsvFilePath, Append, encoding: Encoding.UTF8))
        {
            if (!appendingToExistingData)
            {
                sw.WriteLine(CsvHeader);
            }

            foreach (Message message in messages)
            {
                if (Verbose)
                {
                    Console.WriteLine(message.ToString());
                }

                sw.WriteLine(message.ToCsv());
            }
        }
    }

    // Returns the trimmed, entity-decoded text of the first node matching the selector, or null.
    private static string ReadText(HtmlNode messageNode, string selector)
    {
        HtmlNode node = messageNode.QuerySelector(selector);
        if (node == null)
        {
            return null;
        }

        // InnerText keeps entities such as &quot; encoded; decode them for the CSV.
        return HtmlEntity.DeEntitize(node.InnerText).Trim();
    }

    // Extracts the numeric id that follows "go_to_message" in a reply link, or null if absent.
    private static int? ParseReplyMessageId(HtmlNode replyAnchor)
    {
        string href = replyAnchor?.Attributes["href"]?.Value;
        if (href == null)
        {
            return null;
        }

        // Search for the marker instead of assuming a fixed offset, so an href with a
        // file name before the '#' is handled too.
        int markerIndex = href.LastIndexOf(ReplyAnchorMarker, StringComparison.Ordinal);
        if (markerIndex < 0)
        {
            return null;
        }

        string idText = href.Substring(markerIndex + ReplyAnchorMarker.Length);
        return int.TryParse(idText, NumberStyles.Integer, CultureInfo.InvariantCulture, out int id)
            ? id
            : (int?)null;
    }

    // Prints one line on the given background colour, then restores the console's own colours.
    private static void WriteHighlighted(ConsoleColor background, string text)
    {
        Console.BackgroundColor = background;
        Console.WriteLine(text);
        Console.ResetColor();
    }
}
135+
}

TelegramChatParser/packages.config

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<packages>
3+
<package id="CommandLineParser" version="2.5.0" targetFramework="net461" />
4+
<package id="HtmlAgilityPack" version="1.11.7" targetFramework="net461" />
5+
<package id="HtmlAgilityPack.CssSelectors" version="1.0.2" targetFramework="net461" />
6+
</packages>

0 commit comments

Comments
 (0)