<dialogues>
- <dialogue name="T:/amities/Data/SwitchBoard/ws97.tar/ws97\sw00utt\sw
+_0001_4325.utt" no="1">
- <turn no="1" speaker="A">
- <utt da="bc" id="utt1">
- <![CDATA[ Okay.
]]>
</utt>
- <utt da="qw" id="utt2">
- <![CDATA[ {D So, }
]]>
</utt>
</turn>
- <turn no="2" speaker="B">
- <utt da="qy^d" id="utt1">
- <![CDATA[ [ [ I guess, +
]]>
</utt>
</turn>
- <turn no="3" speaker="A">
- <utt da="+" id="utt1">
- <![CDATA[ What kind of experience [ do you, + do you ] have, then wi
+th child care?
]]>
</utt>
</turn>
- <turn no="4" speaker="B">
- <utt da="+" id="utt1">
My Program
use XML::TreeBuilder;
my $file= 'swbd_50k_42tags.xml';
my $tree = XML::TreeBuilder->new();
$tree->parse_file($file);
foreach my $dialogue ($tree->find_by_tag_name ('dialogue')){
$dialogue_name = $dialogue->attr_get_i('name');
foreach my $turn ($dialogue->find_by_tag_name('turn')){
$turn_no = $turn->attr_get_i('no');
$turn_speaker = $turn->attr_get_i('speaker');
foreach my $utt ($dialogue->find_by_tag_name('utt')){
$da = $utt->attr_get_i('da');
$id = $utt->attr_get_i('id');
$inline = $utt->as_text;
}
}
}
@ngram_length=('1','2','3');
$inline = $utt->as_text;
@words = split(" ",$inline);
@copy = @words;
foreach $ngramlen (@ngram_length) {
@ng = splice @words, 0, $ngramlen-1;
while ($nxt = shift @words) {
push @ng,$nxt;
$ngholder = join(" ",@ng);
print "Current NGRAM:[$ngholder]\n";
shift @ng;
shift @ng = $gram;
}
}
foreach my $utt {
if ($da = sd) {$femi_da eq Statement non opinion}
if (Sda = b) {$femi_da eq Acknowledge-Backchannel}
if ($da = sv) {$femi_da eq Statement-opinion}
if ($da = aa) {$femi_da eq Agree/Accept}
if ($da = %) {$femi_da eq Abandoned}
if ($da = ba) {$femi_da eq Appreciation}
if ($da = qy) {$femi_da eq Yes-No-Question}
if ($da = x) {$femi_da eq Non-Verbal}
if ($da = ny) {$femi_da eq Yes answer}
if ($da = fc) {$femi_da eq Conventional-closing}
if ($da = %) {$femi_da eq uninterpretable}
if ($da = qw) {$femi_da eq Wh-Question}
if ($da = nn} {$femi_da eq No answers}
if ($da = bk} {$femi_da eq Response Acknowledgements}
if ($da = h} {$femi_da eq Hedge}
if ($da = qy^d) {$femi_da eq Decl.Yes-No-Quest}
if ($da = o) {$femi_da eq other}
if ($da = fo) {$femi_da eq other}
if ($da = bc) {$femi_da eq other}
if ($da = by) {$femi_da eq other}
if ($da = fw) {$femi_da eq other}
if ($da = bh) {$femi_da eq Backchannel in quest form}
if ($da = ^q) {$femi_da eq Quotation}
if ($da = bf) {$femi_da eq Summarize}
if ($da = na) {$femi_da eq Affirmative non-yes ans}
if ($da = ny^e) {$femi_da eq Affirmative non-yes ans}
if ($da = ad) {$femi_da eq Action-directive}
if ($da = ^2) {$femi_da eq collab-compl}
if ($da = b^m) {$femi_da eq Repeat-Phrase}
if ($da = qo) {$femi_da eq open-question}
if ($da = qh) {$femi_da eq Rhetorical-Question}
if ($da = ^h) {$femi_da eq hold before anwer-agreement}
if ($da = ar) {$femi_da eq Reject}
if ($da = ng) {$femi_da eq Negayive non-no answer}
if ($da = nn^e) {$femi_da eq Negayive non-no answer}
if ($da = br) {$femi_da eq Signal non-understanding}
if ($da = no) {$femi_da eq other answers}
if ($da = fp) {$femi_da eq Conventional opening}
if ($da = qrr) {$femi_da eq or-clause}
if ($da = arp) {$femi_da eq Dispreferred answer}
if ($da = nd) {$femi_da eq Dispreferred answer}
if ($da = t3) {$femi_da eq 3rd party talk}
if ($da = oo) {$femi_da eq offers,options commit}
if ($da = cc) {$femi_da eq offers,options commit}
if ($da = co) {$femi_da eq offers,options commit}
if ($da = t1) {$femi_da eq self-talk}
if ($da = bd) {$femi_da eq Downplayer}
if ($da = aap) {$femi_da eq maybe/accept-part}
if ($da = am) {$femi_da eq maybe/accept-part}
if ($da = ^g) {$femi_da eq Tag-Question}
if ($da = qw^d) {$femi_da eq Declarative Wh-Quest}
if ($da = fa) {$femi_da eq Apology}
if ($da = ft) {
$femi_da eq Thanking;
}
@counter = ($femi_da , $ngholder);
foreach my $utt {
if ($da = $femi_da) {
$da_counter++;
while ($inline =~ /^$gram\B/) {
$gram_counter++;
It doesnt make a lot of sense. But what i am trying to do is exctract the utterances from the XMl doc and the da's. I then try to extract 0-3 n-grams from each utterance. Each utterance has a DA.
The next step is to get the most frequent n-gram for each utteracnce/DA. basically i need to get a frequent phrase for each DA.
20050329 Janitored by Corion: Added code tags, cleaned formatting, as per the Writeup Formatting Tips
janitored by ybiC: Balanced <readmore> tags around longish codeblock as per Writeup Formatting Tips, retitle from one-word "Counting" to help site search