Printer friendly version

StripHtml


A C# function that converts HTML-formatted text to plain text.

 

 

 

/// <summary>
/// Converts HTML formatted text to plain text by removing every tag.
/// </summary>
/// <remarks>
/// If the input contains a <c>&lt;body&gt;</c> element, only the content between the
/// opening and closing body tags is processed. After the tags are removed, each
/// <c>&amp;nbsp;</c> is replaced by a space, carriage returns, line feeds and tabs are
/// deleted, and the result is trimmed. No other HTML entities are decoded.
/// Text following an unterminated tag (a <c>&lt;</c> with no matching <c>&gt;</c>) is discarded.
/// </remarks>
/// <param name="sHtml">The HTML text to strip. May be null or empty.</param>
/// <returns>The text content without tags; an empty string for null or empty input.</returns>
private string StripHtml(string sHtml)
{
    if (string.IsNullOrEmpty(sHtml))
    {
        return string.Empty;
    }

    // Restrict processing to the <body> content when a well-formed body element exists.
    // Ordinal, case-insensitive search avoids culture-dependent lowercasing of the input.
    int bodyStart = sHtml.IndexOf("<body", System.StringComparison.OrdinalIgnoreCase);
    int bodyEnd = sHtml.IndexOf("</body>", System.StringComparison.OrdinalIgnoreCase);
    if ((bodyStart >= 0) && (bodyEnd > bodyStart))
    {
        int openTagEnd = sHtml.IndexOf('>', bodyStart);
        int contentStart = openTagEnd + 1;

        // If the opening tag is never closed before "</body>", contentStart lies past
        // bodyEnd; leave the input untouched instead of requesting a negative length.
        if ((openTagEnd >= 0) && (contentStart <= bodyEnd))
        {
            sHtml = sHtml.Substring(contentStart, bodyEnd - contentStart);
        }
    }

    StringBuilder sb = new StringBuilder(sHtml.Length);
    int pos = 0;
    while (pos < sHtml.Length)
    {
        int tagStart = sHtml.IndexOf('<', pos);
        if (tagStart < 0)
        {
            // No more tags: keep the trailing text.
            sb.Append(sHtml, pos, sHtml.Length - pos);
            break;
        }

        // Keep the text between the previous tag (or the start) and this tag.
        sb.Append(sHtml, pos, tagStart - pos);

        int tagEnd = sHtml.IndexOf('>', tagStart);
        if (tagEnd < 0)
        {
            // Unterminated tag: the remainder is treated as markup and dropped.
            break;
        }

        pos = tagEnd + 1;
    }

    // Minimal clean-up: only &nbsp; is decoded; CR, LF and TAB characters are removed.
    string res = sb.ToString();
    res = res.Replace("&nbsp;", " ");
    res = res.Replace("\r", "");
    res = res.Replace("\n", "");
    res = res.Replace("\t", "");
    return res.Trim();
}

Tip a friend  Go back