Send a suggestion!

We're building a brand new version of the site, and we'd love to hear your ideas

Members

Technology Zones

IBM Learning Center

Articles

Hosted By

MaximumASP

Info

Rated
Read 49,570 times

Contents

Downloads

Related Categories

Creating a Generic Site-To-Rss Tool - Building the Generic SiteToRSS Class

RoyOsherove

Building the Generic SiteToRSS Class

Our class should have several public properties representing the various RSS feed properties (description, generator, and so on). It should also be able to download a site from the Web and write an RSS feed into a file or just return it as a string. I'll spare you the entire code of the class, but I'll refer here to the less trivial methods inside it. Here's the basic layout:

''' <summary>
'''    Downloads a web page, picks the news items out of it with a
'''    caller supplied regular expression and writes them out as an RSS 2.0 feed.
'''    Only the less trivial members are spelled out in this listing;
'''    members with an empty body are omitted here.
''' </summary>
Public Class RSSCreator
    ''' <summary>
    '''    instantiates the class with a URL and a file name
    '''    (body omitted in this listing)
    ''' </summary>
    Public Sub New(ByVal Url As String, ByVal FileName As String)
    End Sub

    ''' <summary>
    '''    instantiates the class with a URL only
    '''    (body omitted in this listing)
    ''' </summary>
    Public Sub New(ByVal Url As String)
    End Sub

    ''' <summary>
    '''    the address of the page that GetHtml() downloads
    ''' </summary>
    Public Property UrlToParse() As String
    End Property

    ''' <summary>
    '''    the file to which the RSS feed will be written
    ''' </summary>
    Public Property FileName() As String
    End Property

    ''' <summary>
    '''    returns a string containing the RSS feed xml
    ''' </summary>
    ''' <returns>the complete feed document as text</returns>
    ''' <remarks>
    '''    Failures are passed on to the caller. Earlier revisions returned
    '''    the exception text in place of the feed, which no RSS reader can parse.
    ''' </remarks>
    Public Overloads Function GetRss() As String
        Dim ms As New MemoryStream

        'UTF8Encoding(False) leaves out the byte order mark. Encoding.UTF8 writes
        'one, and it would come back as a stray U+FEFF in front of the xml text
        Dim sr As New StreamWriter(ms, New UTF8Encoding(False))

        Try
            'We send "false" to signal the method to not close the stream
            'automatically in the end: the bytes still have to be read back
            WriteRSS(sr, False)

            'push whatever is still buffered in the writer into the stream
            sr.Flush()

            'ToArray() copies exactly the bytes that were written
            '(GetBuffer() would expose the unused capacity as well)
            Return Encoding.UTF8.GetString(ms.ToArray())

        Finally
            'runs on success and on failure, so the writer is always released
            sr.Close()

        End Try

    End Function

    ''' <summary>
    '''    writes the resolved RSS feed to a file
    ''' </summary>
    ''' <returns>whatever the TextWriter overload returns</returns>
    Public Overloads Function WriteRSS() As String
        Dim writer As New StreamWriter(FileName, False, Encoding.UTF8)
        Try
            Return WriteRSS(writer, True)
        Finally
            'Closing a StreamWriter twice is harmless. This call is what
            'releases the file handle when writing fails half way through.
            writer.Close()
        End Try
    End Function

    ''' <summary>
    '''    Writes the resolved RSS feed to a text writer
    '''    and returns the text that was written (if it was written to a file)
    '''    (body omitted in this listing)
    ''' </summary>
    Public Overloads Function WriteRSS(ByVal txWriter As TextWriter, _
    ByVal closeAfterFinish As Boolean) As String

    End Function

    ''' <summary>
    '''    writes the beginning of the XML document:
    '''    the declaration, the rss root element and the channel header
    ''' </summary>
    Private Sub WritePrologue(ByVal writer As XmlTextWriter)
        With writer
            .WriteStartDocument()

            'the "r" format always prints "GMT" and does not convert the value,
            'so it has to be handed a UTC time to begin with
            .WriteComment("RSS generated by SiteToRSS generator at " & _
                DateTime.UtcNow.ToString("r"))

            .WriteStartElement("rss")
            .WriteAttributeString("version", "2.0")

            'declare the namespace through the prefix overload
            'rather than by hiding the colon inside the attribute name
            .WriteAttributeString("xmlns", "blogChannel", Nothing, _
                "http://backend.userland.com/blogChannelModule")

            'the channel element is left open here for the items that follow
            .WriteStartElement("channel", "")
            .WriteElementString("title", RSSFeedName)
            .WriteElementString("link", RssFeedLink)
            .WriteElementString("description", RssFeedDescription)
            .WriteElementString("copyright", RssFeedCopyright)
            .WriteElementString("generator", _
                "SiteParser RSS engine 1.0 by Roy Osherove")
        End With
    End Sub


    ''' <summary>
    '''    adds a post to the RSS feed
    ''' </summary>
    Private Sub AddRssItem(ByVal writer As XmlTextWriter, _
    ByVal title As String, ByVal link As String, _
    ByVal description As String, _
    ByVal pubDate As String, ByVal subject As String)

        writer.WriteStartElement("item")
        writer.WriteElementString("title", title)
        writer.WriteElementString("link", link)

        'write the description as CDATA because
        'it might contain invalid chars
        writer.WriteStartElement("description")

        'A CDATA section may not contain "]]>" and the writer rejects such text.
        'Cut the text between "]]" and ">" so that every section is well formed;
        'a reader sees the pieces as one continuous run of text.
        Dim remaining As String = description
        If remaining Is Nothing Then
            remaining = String.Empty
        End If

        Dim cutAt As Integer = remaining.IndexOf("]]>")
        While cutAt >= 0
            writer.WriteCData(remaining.Substring(0, cutAt + 2))
            remaining = remaining.Substring(cutAt + 2)
            cutAt = remaining.IndexOf("]]>")
        End While
        writer.WriteCData(remaining)

        writer.WriteEndElement()

        writer.WriteElementString("category", subject)
        writer.WriteElementString("pubDate", pubDate)
        writer.WriteEndElement()

    End Sub

    ''' <summary>
    '''    generates a new regular expression
    '''    and retrieves the HTML from the web
    ''' </summary>
    Private Sub ParseHtml()
        m_FoundRegex = New Regex(RegexPattern)
        GetHtml()

    End Sub


    ''' <summary>
    '''    retrieves the web page from the web
    '''    (body omitted in this listing)
    ''' </summary>
    Private Sub GetHtml()
    End Sub

    ''' <summary>
    '''    the text of the page that GetHtml() retrieved
    ''' </summary>
    Public Property DownloadedHtml() As String
    End Property

    ''' <summary>
    '''    this prefix will be prepended to every news item link
    ''' </summary>
    Public Property LinksPrefix() As String
    End Property

    ''' <summary>
    '''    the regular expression used to find the news items.
    '''    Assigning it runs VerifyPatternIsGood() on the new value first.
    ''' </summary>
    Public Property RegexPattern() As String
        Get
            Return m_strRegexPattern
        End Get
        Set(ByVal Value As String)
            'important!
            'We need to verify this or we won't have a viable feed
            'The check runs before the assignment, so a rejected pattern
            'never replaces the one already stored
            VerifyPatternIsGood(Value)
            m_strRegexPattern = Value
        End Set
    End Property

    ''' <summary>
    '''    verify that the required group names appear
    '''    in the regular expression passed to the parsing engine
    '''    (body omitted in this listing)
    ''' </summary>
    Private Sub VerifyPatternIsGood(ByVal pattern As String)
    End Sub

    ''' <summary>
    '''    uses a regex to determine if a certain named group
    '''    exists within another regex string.
    '''    If not, an exception is thrown.
    '''    (body omitted in this listing)
    ''' </summary>
    Private Sub VerifyPatternIsGood(ByVal pattern As String, _
    ByVal NeededGroupName As String)
    End Sub

    ''' <summary>
    '''    written as the channel's description element
    ''' </summary>
    Public Property RssFeedDescription() As String
    End Property

    ''' <summary>
    '''    written as the channel's link element
    ''' </summary>
    Public Property RssFeedLink() As String
    End Property

    ''' <summary>
    '''    written as the channel's title element
    ''' </summary>
    Public Property RSSFeedName() As String
    End Property


    ''' <summary>
    '''    written as the channel's copyright element
    ''' </summary>
    Public Property RssFeedCopyright() As String
    End Property

End Class

The class itself is very simple to use. You instantiate it with a URL and possibly a file name and then set its properties, which reflect the feed properties. Several actions need special attention.

Verifying the Existence of Capture Groups in a Pattern

When setting the RegexPattern property, the class runs an internal check to verify that the entered Regex contains all the group names that are expected in order to successfully write the RSS feed. To this end, it calls the “VerifyPatternIsGood()” method, which internally calls an overload of itself with each required group name. This overload actually runs a match on the expression using its own regular expression to check that the passed group name is indeed inside the pattern text, kind of like performing brain surgery on yourself.

Here is the code for these two methods.

''' <summary>
    '''    verify that the required group names appear
    '''    in the regular expression passed to the parsing engine
    ''' </summary>
    ''' <param name="pattern">the regular expression text to check</param>
    ''' <remarks>
    '''    The two-argument overload throws an ArgumentException for the
    '''    first required group it cannot find; that exception reaches the
    '''    caller untouched.
    ''' </remarks>
    Private Sub VerifyPatternIsGood(ByVal pattern As String)
        'No Try/Catch on purpose: catching the exception only to "Throw ex"
        'changed nothing for the caller except that it reset the stack trace
        'and hid which check had failed.
        VerifyPatternIsGood(pattern, "description")
        VerifyPatternIsGood(pattern, "title")
        VerifyPatternIsGood(pattern, "link")
        VerifyPatternIsGood(pattern, "category")
        VerifyPatternIsGood(pattern, "pubDate")
    End Sub

    ''' <summary>
    '''    usees a reges to determine if a certain named group
    '''    exists within another regex string.
    '''    If not, an exception is thrown.
    ''' </summary>
    Private Sub VerifyPatternIsGood(ByVal pattern As String,
    ByVal NeededGroupName As String)
        Dim VerifyRegex As String = "\?<" & NeededGroupName & ">"

        If Not Regex.Match(pattern, VerifyRegex).Success Then
            Throw New ArgumentException(NeededGroupName &
        " group missing form pattern")
        End If

    End Sub

Retrieving the Site Using the WebClient Class

The class is responsible for retrieving the site's HTML content from the Web. To that end, it uses the WebClient class, which enables us to download Web pages, download or upload files, post requests, and lots of other cool stuff.

The method that does this work is the GetHtml() method:

    ''' <summary>
    '''    retrieves the web page form the web
    ''' </summary>
    Private Sub GetHtml()
        Try
            Dim req As New WebClient
            Dim reader As New StreamReader(req.OpenRead(UrlToParse))
            Me.DownloadedHtml = reader.ReadToEnd()

            reader.Close()
            req.Dispose()

        Catch oE As System.Exception
        End Try
    End Sub

Roy Osherove has spent the past 6+ years developing data driven applications for various companies in Israel. He's acquired several MCP titles, written a number of articles on various .NET topics, most of which can be found on his weblog, and loves discovering new things every day. Roy is also the author of the Feedable service and of the free regular expression tool, The Regulator.

Comments

  • MakeRSS: How to change to a different URL?

    Posted by Qmark on 08 Apr 2005

    I'm having trouble changing:
    http://www.dotnetwire.com (the default URL in MakeRSS)
    to a different URL.


    1) Uploaded the MakeRSS \bin files to the ASP.net server's BIN folder.
    2) Uplo...

  • Problem with RegEx

    Posted by anker on 13 Aug 2004

    Well I have just posted a topic and saw you're complication. I am new at RegEx, but I think it has something to do with greediness. RegEx tries to match as much as possible, unless you specify otherwi...

  • Special Characters

    Posted by anker on 13 Aug 2004

    When you grab the html from another homepage it converts the text to UTF8, which is fine.
    However in Denmark and a lot of other countries we use special Characters like Æ Ø Å, which is replaced by me...

  • Problem with RE

    Posted by desp on 29 Feb 2004

    As you mentioned that ? before the “"\s*target="newwindow"” section is used to to catch the first occurence but why i m having following problem

    Pattern = .*)?(">)
    String =