Option Explicit Dim PrintRow As Integer Dim HomePath As String ' MstrList - vector of htm filenames captured ' ToDoList - vector of subfolders that need to be processed Dim MstrList, ToDoList, UpLinksIgnored, UpLinksIgnoredLoop As Variant _____________________________________________________________________________________ Private Sub BuildOneSiteMap() ' is button on SiteMap sheet PrintRow = Range("SitePrintFirstRow").Row BuildSiteMap GetSiteRowNum End Sub _____________________________________________________________________________________ Private Function GetSiteRowNum() As Integer Dim Row As Integer Row = Range("SiteNum").Value ' we check quality of row number If Row < 1 Or Row > Range("SiteInputMatrix").Rows.Count Then Sheets("SiteMap").Select Range("SiteNum").Select MsgBox "Must enter row number within range of input matrix: 1 to " & Range("SiteInputMatrix").Rows.Count & ".", vbCritical, "Aborted!" End ElseIf 0 < InStr(Range("SiteNum").Value, ".") Then Sheets("SiteMap").Select Range("SiteNum").Select MsgBox "Row Number must be Integer.", vbCritical, "Aborted!" End End If ' and album name existence If Len(Range("SiteInputMatrix").Cells(Row, 3).Value) = 0 Then Sheets("SiteMap").Select Range("SiteInputMatrix").Cells(Row, 3).Select MsgBox "Path for selected row is empty.", vbCritical, "Aborted!" End End If GetSiteRowNum = Row End Function _____________________________________________________________________________________ Private Sub BuildAllSiteMaps() ' is button on SiteMap sheet Dim i As Integer Dim StartTime As Variant PrintRow = Range("SitePrintFirstRow").Row StartTime = Time ClearSiteRepOutput For i = 1 To Range("SiteInputMatrix").Rows.Count If Range("SiteInputMatrix").Cells(i, 3).Value <> "" Then BuildSiteMap i, True PrintRow = PrintRow + 1 End If Next i Sheets("SiteMap").Cells(PrintRow, 1).Value = "Total Time taken: " & FormatSeconds(DateDiff("s", StartTime, Time)) Application.StatusBar = False End Sub _____________________________________________________________________________________ Private Sub BuildSiteMap(ByVal i As Integer, Optional NoClearFlag As Boolean) ' builds sitemap.html file for domain ' arg: i - row number in name matrix, NoClearFlag when processing all rows ' -> PDFs, TXTs, XLSs, must have lowercase extensions ' -> all paths must be relative. The ./ to the root is not supported. ' -> Any bad internal links return an error. Should check first with: https://www.totalvalidator.com/ ' -> does not allow for periods in folder names ' -> all instances of file call must have same case, or dups created ' -> href calls to folders MUST have '/' at end. web will work without, but reading from the hard disk won't ' -> upward only links are included, but not followed. bad practice to have them. ' -> You can control navigation stuff that messes up flow using these comments in the html code (only one on page needed): ' ' Dim DebugFlag As Boolean, PostProcessFlag As Boolean Dim j As Integer, k As Integer, NSi As Integer Dim a As String, DN As String, H As String, LoopsFldr As String Dim OptLbl As String, t As String Dim Diffs, HomePgFldrs, Locs, NakedPaths, NotInLocs, NP, NumSlashes, PgLinks, res, Rows, S, ST, StartTime, SubLinks, UPaths As Variant DebugFlag = False ' get local variables StartTime = Time If Not NoClearFlag Then ClearSiteRepOutput DN = Range("SiteInputMatrix").Cells(i, 1).Value PostProcessFlag = Range("SiteInputMatrix").Cells(i, 2).Value ' set some of the global variables HomePath = Range("SiteInputMatrix").Cells(i, 3).Value If Len(DN) = 0 Then DN = ExtractFilename(HomePath) UpLinksIgnored = Empty UpLinksIgnoredLoop = Empty ' ~~ home page file processing ' MstrList - vector of filenames, home page and and htms off home page ' ToDoList - vector of subfolders found on page, some with htm and some just folder name res = GetIndexFileData(HomePath) ' inital setting of these global variables MstrList = res(0) ToDoList = res(1) ' loop through files to find other files (adds to: MstrList, ToDoList) ' MstrList- doesn't change ' ToDoList - folders/htms under ToDoList added to end of ToDoList. we just added one level down. BuildSiteMapSub MstrList, "", HomePath ' if no files or subfolders on home page, we are done If IsEmpty(ToDoList) Then GoTo CreatePage ' ~~ remove first folder underneath names from subfolder list ' as we will be processing them in the next section ReDim HomePgFldrs(UBound(ToDoList)) For i = 0 To UBound(ToDoList) HomePgFldrs(i) = Left(ToDoList(i), InStr(ToDoList(i), "/")) Next i HomePgFldrs = RemDupSameOrder(HomePgFldrs) ToDoList = Without(ToDoList, HomePgFldrs) ' ~~~ loop for subfolders on home page For i = 0 To UBound(HomePgFldrs) LoopsFldr = HomePgFldrs(i) Again: ' index file processing ' returns vector nest: 1-list all htms and pdfs on page, 2-list subfolder links on page res = GetIndexFileData(HomePath & "\" & DropStr(LoopsFldr, -1)) PgLinks = res(0) SubLinks = res(1) ' we want index file name to be first for this folder (so add to end of MstrList variable first) If Not IsEmpty(PgLinks) Then AddToMstrList LoopsFldr & PgLinks(0) End If ' add other files already found if in this folder (add to master list and remove from subfolders to process) If Not IsEmpty(ToDoList) Then For j = 0 To UBound(ToDoList) If LoopsFldr = Left(ToDoList(j), 1 + Len(ToDoList(j)) - InStr(Reverse(ToDoList(j)), "/")) Then AddToMstrList ToDoList(j) ToDoList(j) = Empty End If Next j ToDoList = RemoveEmptiesInVector(ToDoList) End If ' ~~ add what we found in index file and process If Not IsEmpty(PgLinks) Then ' saves folders and htms to process later If Not IsEmpty(SubLinks) Then For j = 0 To UBound(SubLinks) ToDoList = AppendVectors(ToDoList, LinkMerge(LoopsFldr, SubLinks(j))) Next j End If ' adds this page's htms to master list (before we add what's below) For j = 0 To UBound(PgLinks) PgLinks(j) = LinkMerge(LoopsFldr, PgLinks(j)) AddToMstrList PgLinks(j) Next j ' loop through files to find other files below BuildSiteMapSub PgLinks, LoopsFldr, HomePath End If ' we return to the top of the loop if there is more to process under this home page link If Not IsEmpty(UpLinksIgnoredLoop) Then ' give links the rest of their paths, and save for processing at the end For k = 0 To UBound(UpLinksIgnoredLoop) UpLinksIgnoredLoop(k) = LinkMerge(LoopsFldr, UpLinksIgnoredLoop(k)) Next k UpLinksIgnored = AppendVectors(UpLinksIgnored, UpLinksIgnoredLoop) UpLinksIgnoredLoop = Empty End If If Not IsEmpty(ToDoList) Then For j = 0 To UBound(ToDoList) ' looking for first on the to-process list in the same primary folder If HomePgFldrs(i) = Left(ToDoList(j), Len(HomePgFldrs(i))) Then LoopsFldr = ExtractPath(ToDoList(j)) ToDoList = Without(ToDoList, LoopsFldr) GoTo Again End If Next j End If Next i ' ~~~~~ collection processing done ~~~~~ ' errors check (it is possible that this can never be, but in case) If Not IsEmpty(ToDoList) Then Application.ScreenUpdating = False WriteVarToSheet MstrList, "SM-MstrList" With Sheets("SiteMap") .Cells(PrintRow, 1).Value = "Contact developer." PrintRow = PrintRow + 1 .Cells(PrintRow, 1).Value = "There were " & UBound(ToDoList) + 1 & " left in the subfolder variable. Returned results would not be complete. Errors are below." PrintRow = PrintRow + 1 .Cells(PrintRow, 1).Value = "MstrList written to SM-MstrList sheet." PrintRow = PrintRow + 1 For i = 0 To UBound(ToDoList) .Cells(PrintRow + i, 1).Value = ToDoList(i) Next i End With Application.StatusBar = False End End If ' ~~ we print out the uplinks we found and didn't process If Not IsEmpty(UpLinksIgnored) Then Application.StatusBar = "Checking uplinks...." UpLinksIgnored = RemDupSameOrder(UpLinksIgnored) For i = 0 To UBound(UpLinksIgnored) If Right(UpLinksIgnored(i), 1) = "/" Then UpLinksIgnored(i) = UpLinksIgnored(i) & GetIndexFileName(HomePath & "\" & UpLinksIgnored(i)) End If If IsMember(MstrList, UpLinksIgnored(i)) Then UpLinksIgnored(i) = Empty End If Next i UpLinksIgnored = RemoveEmptiesInVector(UpLinksIgnored) If Not IsEmpty(UpLinksIgnored) Then With Sheets("SiteMap") .Cells(PrintRow, 1).Value = DN & ": Upward only links were found. Good were added to the end of their folder." PrintRow = PrintRow + 1 .Cells(PrintRow, 1).Value = "Note that any links in these files where not checked or processed." PrintRow = PrintRow + 1 .Cells(PrintRow, 1).Value = "Best to not have upward only links!" PrintRow = PrintRow + 1 ' loop for links For j = 0 To UBound(UpLinksIgnored) ' first we check the quality of the link S = HomePath & "\" & UpLinksIgnored(j) If Not FileExists(S) Then .Cells(PrintRow, 1).Value = "File doesn't exist:" .Cells(PrintRow, 3).Value = S PrintRow = PrintRow + 1 ' add up-only links after the last of its naked path entries Else Locs = LocsInVector(ReturnNakedPaths(MstrList, True), ExtractPath(UpLinksIgnored(j))) If IsEmpty(Locs) Then .Cells(PrintRow, 1).Value = "You have a bad link somewhere. Run Total Validator Pro. OR" PrintRow = PrintRow + 1 .Cells(PrintRow, 1).Value = "Not found in list of naked paths: " & UpLinksIgnored(j) PrintRow = PrintRow + 1 .Cells(PrintRow, 1).Value = "Do Find -- paying attention to case -- on: " & ExtractFilename(ExtractPath(UpLinksIgnored(j), True)) Else k = Locs(UBound(Locs)) + 1 res = AppendVectors(Array(UpLinksIgnored(j)), DropVec(MstrList, k)) MstrList = AppendVectors(Take(MstrList, k), res) .Cells(PrintRow, 1).Value = "Added:" .Cells(PrintRow, 3).Value = S End If PrintRow = PrintRow + 1 End If Next j End With End If End If ' remove disallows from list ' -> I question whether this should exist at all If FileExists(HomePath & "\robots.txt") Then Application.StatusBar = "Removing Disallows...." res = ReturnFileAsArrayOfRows(HomePath & "\robots.txt") For i = 0 To UBound(res) If Left(res(i), 10) = "Disallow: " Then t = DropStr(res(i), 11) ' remove only files that have an alt sort. we want folders to still appear in HTML sitemap If Right(t, 1) <> "/" Then If IsMember(MstrList, t) Then MstrList = Without(MstrList, t) End If End If Next i End If ' ~~ debugging: list contents of filenames captured in MstrList variable If DebugFlag Then Application.ScreenUpdating = False WriteVarToSheet MstrList, "SM-MstrList" End If ' ~~~~~ Post processing clean up ~~~~~ If Not PostProcessFlag Then GoTo CreatePage ' ~~ making members of same folder contiguous Application.StatusBar = "Cleaning up the link order, pass #1 (combining folders into contiguous blocks)..." ST = Time ' combine folders into contiguous blocks NP = ReturnNakedPaths(MstrList, True) UPaths = RemDupSameOrder(NP) ' loop for each unique path (skipping first as it is the root) For i = 1 To UBound(UPaths) NP = ReturnNakedPaths(MstrList, True) ' ~~ test for dispersion ' find differences between members of folder Locs = LocsInVector(NP, UPaths(i)) Diffs = DiffsInVector(Locs) If WorksheetFunction.Max(Diffs) > 1 Then ' find beginning of good block to put early birds in front of (can't be first) Diffs(0) = 0 ' this returns location in base 0 of first contiguous entries j = Iota2(Diffs, 1) Rows = LocsInVector(NP, UPaths(i), False) res = AppendVectors(IndexIntoVector(MstrList, Take(Locs, j)), DropVec(MstrList, Locs(j))) MstrList = AppendVectors(Take(IndexIntoVector(MstrList, Rows), Locs(j) - j), res) End If Next i Sheets("SiteMap").Cells(PrintRow, 1).Value = "Pass #1 time: " & DateDiff("s", ST, Time) & " seconds" PrintRow = PrintRow + 1 ' ~~ we may still have a gap. so we look for gaps and move together behind first (move gap to end) Application.StatusBar = "Cleaning up the link order, pass #2 (moving sub-folder calls -- e.g. to PDFs -- to end of folder)..." ST = Time NP = ReturnNakedPaths(MstrList, True) UPaths = RemDupSameOrder(NP) ' loop for each unique path (skipping root) For i = 1 To UBound(UPaths) NP = ReturnNakedPaths(MstrList, True) ' test for gap (images folder poked into the middle, this moves to end) Locs = LocsInVector(NP, UPaths(i)) Diffs = DiffsInVector(Locs) If WorksheetFunction.Max(Diffs) > 1 Then j = Iota2(Diffs, 1, False) Rows = AppendVectors(DropVec(Locs, j + 1), AddToVector(Iota(Diffs(j) - 2), Locs(j) + 1)) res = AppendVectors(IndexIntoVector(MstrList, Rows), DropVec(MstrList, Locs(UBound(Locs)) + 1)) MstrList = AppendVectors(Take(MstrList, Locs(j) + 1), res) End If Next i Sheets("SiteMap").Cells(PrintRow, 1).Value = "Pass #2 time: " & DateDiff("s", ST, Time) & " seconds" PrintRow = PrintRow + 1 ' ~~ moving early bird PDFs put them in front of index files. so we fix and make index always first Application.StatusBar = "Cleaning up the link order, pass #3 (making sure index file is first in folder)..." ST = Time NP = ReturnNakedPaths(MstrList, True) UPaths = RemDupSameOrder(NP) For i = 1 To UBound(UPaths) NP = ReturnNakedPaths(MstrList, True) Locs = LocsInVector(NP, UPaths(i)) For k = 1 To UBound(Locs) ' look for index file for this unique path, and it not being first If 0 < InStr(MstrList(Locs(k)), "/index.") Then Rows = IndexIntoVector(Locs, AppendVectors(k, Iota(k - 1))) res = AppendVectors(IndexIntoVector(MstrList, Rows), DropVec(MstrList, Locs(k) + 1)) MstrList = AppendVectors(Take(MstrList, Locs(0)), res) GoTo FoundIndex End If Next k FoundIndex: Next i Sheets("SiteMap").Cells(PrintRow, 1).Value = "Pass #3 time: " & DateDiff("s", ST, Time) & " seconds" PrintRow = PrintRow + 1 ' ~~ this pass is looking for sub-subfolders that have been split apart from their parent folder ' (due to my processing one folder at a time is only semi-recursive) Application.StatusBar = "Cleaning up the link order, pass #4 (bring sub-subfolders to parent)..." ST = Time NP = ReturnNakedPaths(MstrList, True) UPaths = RemDupSameOrder(NP) For i = 1 To UBound(UPaths) NP = ReturnNakedPaths(MstrList, True) Locs = LocsInVector(NP, UPaths(i), , True) Diffs = DiffsInVector(Locs) If WorksheetFunction.Max(Diffs) > 1 Then NotInLocs = Without(AddToVector(Iota(Locs(UBound(Locs)) - Locs(0)), Locs(0)), Locs) ReDim res(UBound(MstrList)) For j = 0 To UBound(res) ' before the UPath we are working on If j < Locs(0) Then res(j) = MstrList(j) ' members within the UPath ElseIf Locs(0) <= j And j <= Locs(0) + UBound(Locs) Then res(j) = MstrList(Locs(j - Locs(0))) ' everybody else goes behind ElseIf Locs(0) + UBound(Locs) < j And j < Locs(UBound(Locs)) + 1 Then ' now the not used members within the UPath block res(j) = MstrList(NotInLocs(j - (Locs(0) + UBound(Locs) + 1))) Else res(j) = MstrList(j) End If Next j MstrList = res End If Next i Sheets("SiteMap").Cells(PrintRow, 1).Value = "Pass #4 time: " & DateDiff("s", ST, Time) & " seconds" PrintRow = PrintRow + 1 ' this means something failed ' -> will crash if MastrList is empty If UBound(RemDupSameOrder(MstrList)) <> UBound(MstrList) Then res = ReturnDups(MstrList) For i = 0 To UBound(res) Sheets("SiteMap").Cells(PrintRow, 1).Value = "Duplicate in MstrList: " & res(i) PrintRow = PrintRow + 1 Next i Sheets("SiteMap").Cells(PrintRow, 1).Value = "Processing Halted" Stop End If ' ~~~~~ create page ~~~~~ CreatePage: Application.StatusBar = "Building web page for " & DN & "..." NSi = -1 res = ReturnNakedPaths(MstrList) NumSlashes = res(0) NakedPaths = res(1) H = "" & vbCrLf H = H & "" & vbCrLf H = H & "" & vbCrLf H = H & "" & ExtractPageTitle(HomePath & "\" & MstrList(0)) & " Site Map" & vbCrLf H = H & "" & vbCrLf H = H & "" & vbCrLf H = H & "" & vbCrLf H = H & "" & vbCrLf & vbCrLf H = H & "
" & vbCrLf H = H & "

" & DN & " Site Map

" & vbCrLf & vbCrLf H = H & "

" & DN & " Homepage" & vbCrLf H = H & "
Total pages: " & Format(UBound(MstrList) + 1, "#,##0") & "

" & vbCrLf H = H & "
" & vbCrLf H = H & "
" & vbCrLf For i = 0 To UBound(MstrList) ' sometimes a heading If i = 0 Or NakedPaths(i) <> NakedPaths(WorksheetFunction.Max(0, i - 1)) Then If NSi = NumSlashes(i) Then H = H & vbCrLf & "" & vbCrLf & vbCrLf End If If NumSlashes(i) = 0 Then a = "/" Else a = ExtractPath(MstrList(i)) End If H = H & " 0 Then H = H & " class=""level-" & NumSlashes(i) + 1 & " has-pages""" End If H = H & ">" & vbCrLf ' count number of pages for this heading k = 0 For j = i To UBound(NakedPaths) If NakedPaths(i) = NakedPaths(j) Then k = k + 1 Else GoTo FoundNextFolder End If Next j FoundNextFolder: t = "" If k > 1 Then t = "s" H = H & "
  • " & a & "   " & k & " page" & t & vbCrLf & vbCrLf End If NSi = NumSlashes(i) ' we have to look up the title (strings added to array must have 4 characters) If IsMember(Array(".pdf", ".txt", ".xls"), Right(MstrList(i), 4)) Then t = ExtractFilename(MstrList(i)) t = Replace(t, "%20", " ") t = Replace(t, "%22", """") t = Replace(t, "%5B", "[") t = Replace(t, "%5D", "]") Else t = HomePath & "\" & MstrList(i) If FileExists(t) Then t = ExtractPageTitle(t) Else Application.StatusBar = False Sheets("SiteMap").Cells(PrintRow, 1).Value = "Processing halted. Bad path found:" PrintRow = PrintRow + 1 Sheets("SiteMap").Cells(PrintRow, 2).Value = t PrintRow = PrintRow + 1 Sheets("SiteMap").Cells(PrintRow, 1).Value = "Possibly a bad upward link combined to create one invalid." GoTo CleanUp End If End If ' the entry (square brackets not allowed in URL) S = MstrList(i) S = Replace(MstrList(i), "[", "%5B") S = Replace(MstrList(i), "]", "%5D") H = H & "
  • " & t & "" & vbCrLf ' only when next level change is downwards S = AppendVectors(NumSlashes, Array(0)) If NumSlashes(i) > S(i + 1) Then H = H & vbCrLf & "" & vbCrLf & vbCrLf ' and another when we change primary folder S = "" If i <> UBound(NakedPaths) Then S = NakedPaths(i + 1) If NakedPaths(i) <> S Then H = H & "" & vbCrLf & vbCrLf End If ' also needed when we jump up two levels (and another for 3, but we should rarely have) S = 0 If i <> UBound(NumSlashes) Then S = NumSlashes(i + 1) S = NumSlashes(i) - S If S > 1 Then t = Left("", 5 * (S - 1)) H = H & t & vbCrLf & vbCrLf End If ElseIf i = UBound(MstrList) Then H = H & vbCrLf & "" & vbCrLf & vbCrLf End If Next i ' ~~ wrap up and save page H = H & "
  • " & vbCrLf H = H & "" & vbCrLf SaveIfChanged HomePath & "\sitemap.html", H ' ~~ print stats report on sheet ReDim S(3) S(0) = CountInString(H, "a href=") S(1) = CountInString(H, ".pdf") / 2 S(2) = CountInString(H, ".txt") / 2 S(3) = CountInString(H, ".xls") / 2 j = S(1) + S(2) + S(3) With Sheets("SiteMap") If S(0) = 2 Then .Cells(PrintRow, 1).Value = DN & ": When the site has only one file, there is no point in having a sitemap file. Generated anyway." PrintRow = PrintRow + 1 End If .Cells(PrintRow, 1).Value = DN & ": Total files (inc. sitemap.html): " & Format(S(0), "#,##0") PrintRow = PrintRow + 1 If j > 0 Then .Cells(PrintRow, 1).Value = "Number of Files: " PrintRow = PrintRow + 1 .Cells(PrintRow, 2).Value = ".htm files: " & Format(S(0) - j, "#,##0") PrintRow = PrintRow + 1 If S(1) > 0 Then .Cells(PrintRow, 2).Value = ".pdf files: " & S(1) PrintRow = PrintRow + 1 End If If S(2) > 0 Then .Cells(PrintRow, 2).Value = ".txt files: " & S(2) PrintRow = PrintRow + 1 End If If S(3) > 0 Then .Cells(PrintRow, 2).Value = ".xls files: " & S(3) PrintRow = PrintRow + 1 End If End If res = DateDiff("s", StartTime, Time) If res > 3 Then .Cells(PrintRow, 1).Value = "Time taken: " & FormatSeconds(res) PrintRow = PrintRow + 1 End If End With CleanUp: Application.Goto Range("A13"), True Application.StatusBar = False End Sub _____________________________________________________________________________________ Private Sub BuildSiteMapSub(ByVal List As Variant, ByVal a As String, ByVal Path As String) ' arg: List - list of folder names to process ' A - sub folder prefix, includes trailing backslash (empty string for root) ' Path - home folder path ' adds to: MstrList, ToDoList Dim i As Integer, j As Integer Dim ExtenList() As String Dim links, res, t As Variant ExtenList = Split(".pdf .txt .xls") ' we start at 1, as we have already processed the index page For i = 1 To UBound(List) AddToMstrList List(i) ' why was the Or Right(List(i), 1) = "/" added? It isn't in APL ' of course ExtractSiteMapLinks wants a filename, but why was a folder name here (at least once upon a time) If Not (IsMember(ExtenList, Right(List(i), 4)) Or Right(List(i), 1) = "/") Then ' extract all links on page res = ExtractSiteMapLinks(Path & "\" & List(i)) links = res(0) ' add the folders to the sub-folder list t = res(1) If Not IsEmpty(t) Then For j = 0 To UBound(t) t(j) = LinkMerge(a, t(j)) Next j If IsEmpty(ToDoList) Then ToDoList = t Else ToDoList = Add2Collection(ToDoList, t) End If End If ' loop to find links that are new Do While Not IsEmpty(links) ' if it already exists in list we drop it and move to next If IsEmpty(links(0)) Then links = DropVec(links, 1) ElseIf IsMember(MstrList, a & links(0)) Then ' this is where when we know we are on the mother page, we remove prior and put this one in ' -> is moving some PDFs from correct position on page to bottom of page. if 1st stays first ' -> is moving w00.htm's (or whatever the first is named) from second to index, to last ' -> in marketing puts the PDFs next to htm. APL has all grouped in APL images/ folder ' MstrList = Without(MstrList, Array(A & Links(0))) ' MstrList = AppendVectors(MstrList, Array(A & Links(0))) links = DropVec(links, 1) Else ' new, so we add AddToMstrList LinkMerge(a, links(0)) ' non-htm files have no more processing (empty Links(0) is ignored) If IsMember(ExtenList, Right(links(0), 4)) Then links = DropVec(links, 1) Else ' if the link is an htm file, we also extract links on it, and add to list of links to process ' but is only extracting one level down. if links two levels down they get picked up later, ' but should be getting picked up now. ExtractSiteMapLinks s/b recursive, and then all processed res = ExtractSiteMapLinks(Path & "\" & a & links(0)) ' the files links = AppendVectors(res(0), DropVec(links, 1)) ' the folders res = res(1) If Not IsEmpty(res) Then For j = 0 To UBound(res) res(j) = LinkMerge(a, res(j)) Next j ToDoList = Add2Collection(ToDoList, res) End If End If End If Loop End If Next i End Sub _____________________________________________________________________________________ Function GetIndexFileData(ByVal Path As String) As Variant ' Path - folder path ' reads: HomePath ' returns: nest of PgList ToDoList Dim Name As String, t As String Dim r0, res As Variant ' extract links in page. can't be any upward looking on index page. Name = GetIndexFileName(Path) If Len(Name) > 0 Then t = DropStr(Path, Len(HomePath)) If 0 < InStr(t, ":") Then MsgBox "You have a typo in a link that must be fixed before processing." & vbLf & "In this file: " & HomePath & "\" & Name & vbLf & "Search for: " & DropStr(t, 1), vbCritical, "Aborted!" End End If DoEvents Application.StatusBar = "Processing: " & Replace(Path, "/", "\") & "..." res = ExtractSiteMapLinks(Path & "\" & Name) ' also add this page's name to the front of the list r0 = res(0) If IsEmpty(r0) Then res(0) = Array(Name) Else res(0) = AppendVectors(Array(Name), res(0)) End If Else ReDim res(1) End If GetIndexFileData = res End Function _____________________________________________________________________________________ Function ExtractSiteMapLinks(FileName As String) As Variant ' from html page extracts the links that sitemap uses ' returns: nest of file names and subfolder names Dim b As Boolean, Ext As Boolean Dim i As Integer, j As Integer Dim D As String, t As String Dim ExtenList, ExtList, FileList, links, Locs, res, SubList As Variant FileName = Replace(FileName, "/", "\") D = GetPageAfterNav(FileName) If Len(D) = 0 Then ExtractSiteMapLinks = res Exit Function End If Locs = FindAllLinkLocations(D, "Sitemap Extraction") ' find all link locations (but none that are upward looking). will have duplicates links = ExcludeLinks(D, Locs, Array(" href=""http", " href="".", " href=""#", " href=\""mai", " href=""sitemap")) ' ~~ add any upward PDFs, TXTs, XLSs on page ExtenList = Split(".pdf .txt .xls") ' start with all upward looking link locations ' when page has no navbar, this will catch the backwards links in the upper left Locs = IncludeLinks(D, Locs, Array(" href=""../")) ' keep only the ones that don't require further processing If Not IsEmpty(Locs) Then ' any additions to list need to be 4 characters long res = TurnLocs2Links(D, Locs) For i = 0 To UBound(Locs) b = True For j = 0 To UBound(ExtenList) If Right(res(i), Len(ExtenList(j))) = ExtenList(j) Then b = False Next j If b Then Locs(i) = "" ' this is capturing upward paths that we are ignoring ' -> why did we bother to collect PDFs before? couldn't they somply go in here? t = Right(res(i), 4) b = t = ".htm" Or t = "html" Or t = ".txt" Or t = ".xls" If b Or (Right(res(i), 1) = "/" And Len(WithoutStrVer(res(i), "./")) > 0) Then UpLinksIgnoredLoop = AppendVectors(UpLinksIgnoredLoop, res(i)) End If End If Next i Locs = RemoveEmptiesInVector(Locs) If Not IsEmpty(Locs) Then links = AppendVectors(links, Locs) ' now sort, so these new ones are in their correct location ' this is incompatible with pass #2 cleanup. I think. if you turn this on, you must turn off that ' Links = GradeUp(Links, False, True) End If End If ' ~~ convert locations to the actual links links = TurnLocs2Links(D, links) ' ~~ collect what we found If Not IsEmpty(links) Then ' remove any anchors For i = 0 To UBound(links) j = InStr(links(i), "#") If j > 0 Then links(i) = Left(links(i), j - 1) End If Next i ' remove duplicates (but still in same order) links = RemDupSameOrder(links) ' certain file types (e.g. no images) that we want to keep ExtList = Split("htm html shtml pdf txt xls") ReDim FileList(UBound(links)) ReDim SubList(UBound(links)) For i = 0 To UBound(links) ' is this definitely a sub folder? (the easy test first) ' and remove any root slashes (shouldn't be, could test to see if ever exists) If (Left(links(i), 1) = "/" Or Right(links(i), 1) = "/") And links(i) <> "/" Then SubList(i) = links(i) Else ' keep only certain file types (e.g. no images) If IsMember(ExtList, ExtractExtension(links(i))) Then ' when I had xls here it messed up floorborders's spreadsheets and mixed them into the main folder ' what is really needed is to put into sublist all upward links, as they can't be processed ' if they end up in the SubList, they will end up not being used and the processing will halt If InStr(links(i), "/") > 0 And Not (Right(links(i), 3) = "pdf" Or Right(links(i), 3) = "txt") Then SubList(i) = links(i) Else FileList(i) = links(i) End If End If End If Next i FileList = RemoveEmptiesInVector(FileList) SubList = RemoveEmptiesInVector(SubList) End If ExtractSiteMapLinks = Array(FileList, SubList) End Function _____________________________________________________________________________________ Function ReturnNakedPaths(ByVal MstrList As Variant, Optional OnlyPathsFlag As Boolean) As Variant ' returns nest of: 1-vector number of slashes in each row, 2-vector of naked paths (or empty vectors) ' unless OnlyPathsFlag is on, then returns just nest of naked paths Dim i As Integer, NumSlashes() As Integer Dim NakedArray() As String ReDim NumSlashes(UBound(MstrList)) ReDim NakedArray(UBound(MstrList)) For i = 0 To UBound(MstrList) ' number of slashes in each row NumSlashes(i) = CountInString(MstrList(i), "/") ' naked array. ones in the root get the initialized string of "" as default If NumSlashes(i) > 0 Then NakedArray(i) = ExtractPath(MstrList(i)) End If Next i ' return If OnlyPathsFlag Then ReturnNakedPaths = NakedArray Else ReturnNakedPaths = Array(NumSlashes, NakedArray) End If End Function _____________________________________________________________________________________ Function LinkMerge(ByVal Path As String, ByVal LinkStr As String) As String ' merges a path with an upward link string ' args: Path and LinkStr - paths to merge ' assumes that Path has trailing forward slash / Dim t As Variant ' we loop to handle multiple levels up Do While Left(LinkStr, 3) = "../" ' drop first 3 characters LinkStr = DropStr(LinkStr, 3) ' this test gets rids of links that are far away ' -> but is a kludge. attempt to pass on code messed up too many things ' -> so now spurious path message in report If Path <> "" Then t = DropVec(AllInStr(Path, "/"), -1) If IsEmpty(t) Then Path = "" Else Path = Left(Path, t(UBound(t))) End If End If Loop LinkMerge = Path & LinkStr End Function _____________________________________________________________________________________ Sub AddToMstrList(ByVal PathName As String) ' adds to Master Path global list ' arg: simple string vector of pathname ' reads: MstrList If Not IsMember(MstrList, PathName) Then MstrList = AppendVectors(MstrList, Array(PathName)) End If End Sub _____________________________________________________________________________________ Function TurnLocs2Links(ByVal D As String, ByVal Locs As Variant) As Variant ' if Locs is empty, then returns empty Dim i As Integer Dim dat As String, msg As String, t As String Dim FileList, FL, res As Variant If Not IsEmpty(Locs) Then ReDim res(UBound(Locs)) For i = 0 To UBound(Locs) t = DropStr(D, Locs(i)) t = DropStr(t, InStr(t, """")) t = Left(t, InStr(t, """") - 1) If t = "" Then ' find files located in FileList = ListFiles(HomePath, "htm", True, True) For Each FL In FileList dat = ReadInFile(FL) If 0 < InStr(dat, """""") Then t = UBound(AllInStr(dat, """""")) + 1 msg = msg & DropStr(FL, Len(HomePath)) & " - " & t & " times" & vbLf End If Next FL MsgBox "You have empty links, e.g. href="""". Must fix first. Located at:" & vbLf & vbLf & HomePath & vbLf & vbLf & msg & vbLf & "Alt-PrtScreen can save this Message Box in the clipboard.", vbCritical, "Aborted!" End End If res(i) = t Next i End If TurnLocs2Links = res End Function _____________________________________________________________________________________ Private Sub SortInputRangeSite() ' is button on SiteMap sheet. MUST be on sheet Range("SiteInputMatrix").Sort Key1:=Range("d6") End Sub _____________________________________________________________________________________ Private Sub ClearSiteRepOutput() ' is button on SiteMap sheet Sheets("SiteMap").Range("A" & Range("SitePrintFirstRow").Row & ":C65536").Clear Application.Goto Range("A1"), True Range("SiteNum").Select End Sub _____________________________________________________________________________________ Private Sub GoToBuilders() ClearSiteRepOutput Sheets("Dashboard").Select End Sub _____________________________________________________________________________________ Sub CreateSiteMapFromOtherSheet(ByVal Path As String) Dim i As Integer Dim t As String, UPath As String UPath = UCase(Path) ' loop through paths on SiteMap sheet to find which one For i = 1 To Range("SiteInputMatrix").Rows.Count t = UCase(Range("SiteInputMatrix").Cells(i, 3).Value) If t = Left(UPath, Len(t)) And t <> "" Then GoTo FoundIt Next i MsgBox "Path Not Found on SiteMap sheet:" & vbLf & Path, vbCritical, "Aborted!" Exit Sub FoundIt: Sheets("SiteMap").Select Range("SiteNum").Value = i BuildOneSiteMap End Sub _____________________________________________________________________________________ Private Sub ViewSiteMap() ' is button on SiteMap sheet ActiveWorkbook.FollowHyperlink Range("SiteInputMatrix").Cells(GetSiteRowNum, 3).Value & "\sitemap.html" End Sub _____________________________________________________________________________________