나는 웹 스파이더 응용 프로그램을 모든 소스 코드에서 상속 받았다. 일반 팜플렛 스타일 웹 사이트 (15 페이지 미만)에서는 소프트웨어가 완벽하게 작동합니다.System.StackOverflowException LinqToHtml로 발생
(20,000 페이지 이상) 소프트웨어는 아래 코드에 표시된 줄에 StackOverflowException을 발생시킵니다.
재귀를 유용하게 사용하지 않는 것 같지만 불행히도 LinqToHtml (SuperStarCoders) 라이브러리가 사용되는 것은 지원되지 않습니다.
Partial Public Class Links
Public Property SiteUrl As String
Public Property SiteTitle As String
Public Property Site As String
End Class
다른 2 개 목록은 : 위에 LinkList 변수 (Typing.Links의)리스트가
Private Function ExportXml(Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing) As Boolean
Dim _L = PopulateSEOList(_Worker)
Try
Dim _TmpStr As New Text.StringBuilder
Dim _X As New XDocument, _ct As Long = 0, _Elements As Typing.SEO.Elements = Nothing
ReportProgress(0, _Worker)
With _TmpStr
.Append("<?xml version=""1.0"" encoding=""UTF-8""?>")
.Append("<o7th.Web.Design.Web.Spider>")
For i As Long = 0 To _L.Count - 1
_ct += 1
.Append(" <Page>")
.Append(" <Link>" & XmlEscape(_L(i).Link) & "</Link>")
.Append(" <Title>" & XmlEscape(_L(i).Title) & "</Title>")
.Append(" <Keywords>" & XmlEscape(_L(i).Keywords) & "</Keywords>")
.Append(" <Description>" & XmlEscape(_L(i).Description) & "</Description>")
.Append(" <Elements>")
_Elements = _L(i).ContentElements
If _Elements IsNot Nothing Then
If _Elements.H1 IsNot Nothing Then
.Append(<H1>
<%= (From n In _Elements.H1.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H1>)
End If
If _Elements.H2 IsNot Nothing Then
.Append(<H2>
<%= (From n In _Elements.H2.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H2>)
End If
If _Elements.H3 IsNot Nothing Then
.Append(<H3>
<%= (From n In _Elements.H3.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H3>)
End If
If _Elements.H4 IsNot Nothing Then
.Append(<H4>
<%= (From n In _Elements.H4.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H4>)
End If
If _Elements.H5 IsNot Nothing Then
.Append(<H5>
<%= (From n In _Elements.H5.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H5>)
End If
If _Elements.H6 IsNot Nothing Then
.Append(<H6>
<%= (From n In _Elements.H6.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</H6>)
End If
If _Elements.UL IsNot Nothing Then
.Append(<UL>
<%= (From n In _Elements.UL.AsParallel()
Select
<Content><%= ConvertToCDATA(n) %></Content>).ToList() %>
</UL>)
End If
If _Elements.OL IsNot Nothing Then
.Append(<OL>
<%= (From n In _Elements.OL.AsParallel()
Select
<Content><%= ConvertToCDATA(n) %></Content>).ToList() %>
</OL>)
End If
If _Elements.STRONG IsNot Nothing Then
.Append(<STRONG>
<%= (From n In _Elements.STRONG.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</STRONG>)
End If
If _Elements.EM IsNot Nothing Then
.Append(<EM>
<%= (From n In _Elements.EM.AsParallel()
Select
<Content><%= XmlEscape(n) %></Content>).ToList() %>
</EM>)
End If
If _Elements.BLOCKQUOTE IsNot Nothing Then
.Append(<BLOCKQUOTE>
<%= (From n In _Elements.BLOCKQUOTE.AsParallel()
Select
<Content><%= ConvertToCDATA(n) %></Content>).ToList() %>
</BLOCKQUOTE>)
End If
If _Elements.A IsNot Nothing Then
.Append(<LINKS>
<%= (From n In _Elements.A.AsParallel()
Select
<Content>
<HREF><%= XmlEscape(n.Href) %></HREF>
<REL><%= XmlEscape(n.Rel) %></REL>
<TITLE><%= XmlEscape(n.Title) %></TITLE>
<TARGET><%= XmlEscape(n.Target) %></TARGET>
<CONTENT><%= XmlEscape(n.Content) %></CONTENT>
</Content>).ToList() %>
</LINKS>)
End If
If _Elements.IMG IsNot Nothing Then
.Append(<IMAGES>
<%= (From n In _Elements.IMG.AsParallel()
Select
<Content>
<SRC><%= XmlEscape(n.Source) %></SRC>
<ALT><%= XmlEscape(n.Alt) %></ALT>
<TITLE><%= XmlEscape(n.Title) %></TITLE>
</Content>).ToList() %>
</IMAGES>)
End If
End If
.Append(" </Elements>")
.Append(" <Content><![CDATA[" & _L(i).Content.ToString() & "]]></Content>")
.Append(" </Page>")
ReportProgress((_ct/_L.Count) * 100, _Worker)
Next
.Append("</o7th.Web.Design.Web.Spider>")
End With
Dim _xStr As String = _TmpStr.ToString()
_X = XDocument.Parse(_xStr)
_X.Save(ExportPath & "site.xml")
_X = Nothing
ReportProgress(100, _Worker)
Return True
Catch ex As Exception
'Put logging in here
Message = ex.Message & ":::Export.ExportXml"
Return False
End Try
End Function
: 여기
예외가 발생할 때 실행되는 코드이다 :Imports Superstar.Html.Linq
Public Class Typing
Partial Public Class SEO
Public Property Link As String
Public Property Title As String
Public Property Description As String
Public Property Keywords As String
Public Property Content As HElement
Public Property ContentElements As Elements
Partial Public Class Elements
Public Property H1 As List(Of String)
Public Property H2 As List(Of String)
Public Property H3 As List(Of String)
Public Property H4 As List(Of String)
Public Property H5 As List(Of String)
Public Property H6 As List(Of String)
Public Property UL As List(Of String)
Public Property OL As List(Of String)
Public Property STRONG As List(Of String)
Public Property BLOCKQUOTE As List(Of String)
Public Property EM As List(Of String)
Public Property A As List(Of Links)
Public Property IMG As List(Of Images)
Partial Public Class Images
Public Property Source As String
Public Property Alt As String
Public Property Title As String
End Class
Partial Public Class Links
Public Property Href As String
Public Property Rel As String
Public Property Title As String
Public Property Target As String
Public Property Content As String
End Class
End Class
End Class
End Class
ReportProgress는 이에 대한 Xaml 창의 백그라운드 작업자를보고하고 업데이트합니다 particual 상황이 진행 표시 줄 업데이트 :
Public Sub ReportProgress(ByVal ct As Integer, _Worker As ComponentModel.BackgroundWorker)
If _Worker IsNot Nothing Then
_Worker.ReportProgress(ct)
Threading.Thread.Sleep(500)
End If
End Sub
을하고 다운 클래스는 다음과 같습니다
Imports System.Reflection
Imports System.Net
Imports Superstar.Html.Linq
Public Class Downloader
Implements IDisposable
''' <summary>
''' Get the returned downloaded string
''' </summary>
''' <value></value>
''' <returns></returns>
''' <remarks></remarks>
Public ReadOnly Property ReturnString As String
Get
Return _StrReturn
End Get
End Property
Private Property _StrReturn As String
''' <summary>
''' Get the returned downloaded byte array
''' </summary>
''' <value></value>
''' <returns></returns>
''' <remarks></remarks>
Public ReadOnly Property ReturnBytes As Byte()
Get
Return _FSReturn
End Get
End Property
Private Property _FSReturn As Byte()
Private Property _UserAgent As String = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"
Private Property DataReceived As Boolean = False
''' <summary>
''' Download a string, but do not block the calling thread
''' </summary>
''' <param name="_Path"></param>
''' <remarks></remarks>
Public Sub DownloadString(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing)
SetAllowUnsafeHeaderParsing20()
Using wc As New Net.WebClient()
With wc
Dim _ct As Long = 0
DataReceived = False
.Headers.Add("user-agent", _UserAgent)
.DownloadStringAsync(New System.Uri(_Path))
AddHandler .DownloadStringCompleted, AddressOf StringDownloaded
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
End With
End Using
End Sub
''' <summary>
''' Download a file, but do not block the calling thread
''' </summary>
''' <param name="_Path"></param>
''' <remarks></remarks>
Public Sub DownloadFile(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing)
SetAllowUnsafeHeaderParsing20()
Using wc As New Net.WebClient()
With wc
Dim _ct As Long = 0
DataReceived = False
.Headers.Add("user-agent", _UserAgent)
.DownloadDataAsync(New System.Uri(_Path))
AddHandler .DownloadDataCompleted, AddressOf FileStreamDownload
Do While Not DataReceived
If _Worker IsNot Nothing Then
_ct += 1
ReportProgress(_ct, _Worker)
End If
Loop
End With
End Using
End Sub
''' <summary>
''' Download a parsable HDocument, for using HtmlToLinq
''' </summary>
''' <param name="_Path"></param>
''' <returns></returns>
''' <remarks></remarks>
Public Function DownloadHDoc(ByVal _Path As String, Optional ByVal _Worker As ComponentModel.BackgroundWorker = Nothing) As HDocument
Try
'StackOverFlowException Occurring Here!
DownloadString(_Path, _Worker)
Return HDocument.Parse(_StrReturn)
Catch soex As StackOverflowException
'put some logging in here, with the path attempted
Return Nothing
Catch ex As Exception
SetAllowUnsafeHeaderParsing20()
Return HDocument.Load(_Path)
End Try
End Function
#Region "Internals"
Private Sub SetAllowUnsafeHeaderParsing20()
Dim a As New System.Net.Configuration.SettingsSection
Dim aNetAssembly As System.Reflection.Assembly = Assembly.GetAssembly(a.GetType)
Dim aSettingsType As Type = aNetAssembly.GetType("System.Net.Configuration.SettingsSectionInternal")
Dim args As Object() = Nothing
Dim anInstance As Object = aSettingsType.InvokeMember("Section", BindingFlags.Static Or BindingFlags.GetProperty Or BindingFlags.NonPublic, Nothing, Nothing, args)
Dim aUseUnsafeHeaderParsing As FieldInfo = aSettingsType.GetField("useUnsafeHeaderParsing", BindingFlags.NonPublic Or BindingFlags.Instance)
aUseUnsafeHeaderParsing.SetValue(anInstance, True)
End Sub
Private Sub FileStreamDownload(ByVal sender As Object, ByVal e As DownloadDataCompletedEventArgs)
If e.Cancelled = False AndAlso e.Error Is Nothing Then
DataReceived = True
_FSReturn = DirectCast(e.Result, Byte())
Else
_FSReturn = Nothing
End If
End Sub
Private Sub StringDownloaded(ByVal sender As Object, ByVal e As DownloadStringCompletedEventArgs)
If e.Cancelled = False AndAlso e.Error Is Nothing Then
DataReceived = True
_StrReturn = DirectCast(e.Result, String)
Else
_StrReturn = String.Empty
End If
End Sub
#End Region
#Region "IDisposable Support"
Private disposedValue As Boolean ' To detect redundant calls
' IDisposable
Protected Overridable Sub Dispose(disposing As Boolean)
If Not Me.disposedValue Then
If disposing Then
End If
_StrReturn = Nothing
_FSReturn = Nothing
End If
Me.disposedValue = True
End Sub
Public Sub Dispose() Implements IDisposable.Dispose
Dispose(True)
GC.SuppressFinalize(Me)
End Sub
#End Region
End Class
제가 위에서 말했듯이 happenning 어떤 재귀가처럼, 그것은 보이지 않는다. (적어도 내게 붙어있는 사람은 아무도 없다.) 그래서 나는 그것이 곧 HD 문서 안에 있다고 가정한다.
이것이 어디에서 잘못되었으며 문제를 해결하는 방법을 알려주시겠습니까?
나는 몇 가지 조사를 수행하고, 기본 스택 크기는 1메가바이트 것을 이해, 그래서이 진정으로 내가이 증가하는 시도해야하는 특별한 상황 중 하나입니다 궁금했다 ...
난 후 발견 추적을 여러 번 보았습니다. 특정 페이지에 도달했을 때 항상 발생했습니다. 이 페이지는 단지 크기가 500k를 초과합니다. 여기
는 호출 스택입니다 :[External Code]
> o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Downloader.DownloadHDoc(String _Path, System.ComponentModel.BackgroundWorker _Worker) Line 95 + 0x1e bytes Basic
o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Export.PopulateSEOList(System.ComponentModel.BackgroundWorker _Worker) Line 513 + 0x65 bytes Basic
o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Export.ExportXml(System.ComponentModel.BackgroundWorker _Worker) Line 70 + 0x1e bytes Basic
o7th.Web.Design.Spider.Worker.dll!o7th.Web.Design.Spider.Worker.Export.RunExport(System.ComponentModel.BackgroundWorker _Worker) Line 30 + 0x17 bytes Basic
o7th.Web.Design.WebSpider.exe!o7th.Web.Design.WebSpider.ParseLinks.RunExport(Object sender, System.ComponentModel.DoWorkEventArgs e) Line 106 + 0x2c bytes Basic
[External Code]
과 지역 주민이 내가 그 위에 언급 페이지의 크기가 50 만 이상 나를 보여줍니다
추신 아마도 초기 목록을 정리할 때 여기에있는 시도를 볼 수 있습니다 (문제의 일부라고 생각 함) – Kevin
또 다른 메모. 그것을하는 페이지 수는 아닙니다. 추적을 여러 번보고 난 후에 특정 페이지를 공격했을 때 항상 발생했다고 생각했습니다. 이 페이지는 단지 크기가 500k를 초과합니다. – Kevin
어쨌든 코드에 결함이있는 것 같습니다. 너무 많은 병렬 처리 (imho)를 사용하는 것 외에도, 여러 작업/스레드는 (대부분) 스레드 안전하지 않은 _doc.Descendants를 반복합니다. 그리고 그런 경우에는 StackOverflow가 결과일까요? – igrimpe