projects
/
tclrobot.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
98d2fc5
)
Bug fix for relative links.
author
Adam Dickmeiss
<adam@indexdata.dk>
Thu, 7 Jun 2001 08:10:10 +0000
(08:10 +0000)
committer
Adam Dickmeiss
<adam@indexdata.dk>
Thu, 7 Jun 2001 08:10:10 +0000
(08:10 +0000)
robot.tcl
patch
|
blob
|
history
diff --git
a/robot.tcl
b/robot.tcl
index
5c2b518
..
fa3c595
100755
(executable)
--- a/
robot.tcl
+++ b/
robot.tcl
@@
-1,8
+1,8
@@
#!/usr/bin/tclsh
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $
+# $Id: robot.tcl,v 1.17 2001/06/07 08:10:10 adam Exp $
#
proc RobotFileNext1 {area lead} {
#
proc RobotFileNext1 {area lead} {
- puts "RobotFileNext1 area=$area lead=$lead"
+ # puts "RobotFileNext1 area=$area lead=$lead"
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
@@
-43,7
+43,7
@@
proc RobotReadRecord {inf fromurlx distancex} {
gets $inf
gets $inf
set distance [string trim [gets $inf]]
gets $inf
gets $inf
set distance [string trim [gets $inf]]
- puts "got distance = $distance"
+ # puts "got distance = $distance"
gets $inf
gets $inf
set fromurl [string trim [gets $inf]]
gets $inf
gets $inf
set fromurl [string trim [gets $inf]]
@@
-52,7
+52,7
@@
proc RobotReadRecord {inf fromurlx distancex} {
proc RobotFileNext {area} {
global robotSeq global idleTime ns
proc RobotFileNext {area} {
global robotSeq global idleTime ns
- puts "RobotFileNext robotSeq=$robotSeq"
+ # puts "RobotFileNext robotSeq=$robotSeq"
if {$robotSeq < 0} {
return {}
}
if {$robotSeq < 0} {
return {}
}
@@
-87,27
+87,27
@@
proc RobotFileNext {area} {
proc RobotFileExist {area host path} {
proc RobotFileExist {area host path} {
- puts "RobotFileExist begin area=$area host=$host path=$path"
+ # puts "RobotFileExist begin area=$area host=$host path=$path"
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- puts "RobotFileExist end npath=$npath"
+ # puts "RobotFileExist end npath=$npath"
return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
- puts "RobotFileUnlink begin"
- puts "area=$area host=$host path=$path"
+ # puts "RobotFileUnlink begin"
+ # puts "area=$area host=$host path=$path"
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- puts "npath=$npath"
+ # puts "npath=$npath"
set comp [split $npath /]
set l [llength $comp]
incr l -1
set comp [split $npath /]
set l [llength $comp]
incr l -1
@@
-118,7
+118,7
@@
proc RobotFileUnlink {area host path} {
if {![catch {glob $path/*}]} return
exec rmdir ./$path
}
if {![catch {glob $path/*}]} return
exec rmdir ./$path
}
- puts "RobotFileUnlink end"
+ # puts "RobotFileUnlink end"
}
proc RobotFileClose {out} {
}
proc RobotFileClose {out} {
@@
-134,7
+134,7
@@
proc RobotFileOpen {area host path {mode w}} {
if {![info exists workdir]} {
return stdout
}
if {![info exists workdir]} {
return stdout
}
- puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
@@
-203,7
+203,7
@@
proc RobotStart {} {
global URL
global robotsRunning robotsMax idleTime
global URL
global robotsRunning robotsMax idleTime
- puts "RobotStart"
+ # puts "RobotStart"
while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
@@
-313,28
+313,31
@@
proc RobotHref {url hrefx hostx pathx} {
set surl $dpart/$surl
}
}
set surl $dpart/$surl
}
}
- set c [split $surl /]
- set i [llength $c]
- incr i -1
- set path [lindex $c $i]
- incr i -1
- while {$i >= 0} {
- switch -- [lindex $c $i] {
+ set surllist [split $surl /]
+ catch {unset path}
+ set pathl 0
+ foreach c $surllist {
+ switch -- $c {
.. {
.. {
- incr i -2
- if {$i < 0} {
- set i 0
+ if {$pathl > 0} {
+ incr pathl -1
+ set path [lrange $path 0 $pathl]
}
}
}
}
- . {
- incr i -1
- }
- default {
- set path [lindex $c $i]/$path
- incr i -1
+ . {
+
+ }
+ default {
+ incr pathl
+ lappend path $c
}
}
}
}
}
}
+ if {$pathl} {
+ set path [join $path /]
+ } else {
+ set path ""
+ }
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
puts "Ref href = $href"
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
puts "Ref href = $href"
@@
-429,7
+432,7
@@
proc RobotTextHtml {url out} {
}
puts $out {></meta>}
} body {
}
puts $out {></meta>}
} body {
- regsub -all -nocase {<script.*</script>} $body {} abody
+ regsub -all -nocase {<script([^<]|(<!.*>))*</script>} $body {} abody
regsub -all {<[^\>]+>} $abody {} nbody
puts $out "<documentcontent>"
puts $out $nbody
regsub -all {<[^\>]+>} $abody {} nbody
puts $out "<documentcontent>"
puts $out $nbody
@@
-587,6
+590,10
@@
proc RobotTextPlain {url out} {
proc Robot200 {url} {
global URL domains
proc Robot200 {url} {
global URL domains
+ set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
+ puts -nonewline $out $URL($url,buf)
+ RobotFileClose $out
+
set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
puts $out "<zmbot>"
set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
puts $out "<zmbot>"
@@
-609,7
+616,7
@@
proc Robot200 {url} {
}
}
text/plain {
}
}
text/plain {
- RobotTextPlain $url $out
+ RobotTextPlain $url $out $outr
}
application/pdf {
set pdff [open test.pdf w]
}
application/pdf {
set pdff [open test.pdf w]
@@
-810,6
+817,17
@@
set idleTime 60000
set i 0
set l [llength $argv]
set i 0
set l [llength $argv]
+# For testing only
+if {0} {
+ set url "http://www.sportsfiskeren.dk/sportsfiskeren/corner/index.htm"
+ set href "../../data/../../data2/newsovs.asp?Mode=5"
+
+ set URL($url,path) /sportsfiskeren/corner/index.htm
+ set URL($url,hostport) www.sportsfiskeren.dk
+ RobotHref $url href host path
+ exit 0
+}
+
if {$l < 2} {
puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]}
puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
if {$l < 2} {
puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]}
puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"