読者です 読者をやめる 読者になる 読者になる

EntryFullTextでCSSセレクターを使う

plagger

エントリー書くか、という時になってPlaggerCookbookのレシピに

  • Use XPath and CSS Selector to extract fulltext content
無料でメル友を作る為のサイトplagger

なんて書いてあるのをみつけた。既にあるのかな?だいぶ前に

CSS selector で抽出かけるのは Plagger にもほしいかも

CSS selector to XPath - Bulknews::Subtech - subtech

って書いてあるし、HTML::Selector::XPath は miyagawa さん作なのでありそうだけど、とりあえず習作ということで書いたやつを貼ってみる。

assets/plugins/Filter-EntryFullText/*.yaml に、xpath なら

extract_xpath:
  title: //h2[@id="title"]
  body: //div[@class="section"]

とか書くところを

extract_selector:
  title: h2#title
  body: div.section

なんていう風に書ける。

Index: EntryFullText.pm
===================================================================
--- EntryFullText.pm	(リビジョン 1947)
+++ EntryFullText.pm	(作業コピー)
@@ -258,7 +258,7 @@
     my($self, $args) = @_;
     my $data;
 
-    unless ($self->{extract} || $self->{extract_xpath}) {
+    unless ($self->{extract} || $self->{extract_xpath} || $self->{extract_selector}) {
         Plagger->context->log(error => "YAML doesn't have either 'extract' nor 'extract_xpath'");
         return;
     }
@@ -271,19 +271,35 @@
 	}
     }
 
-    if ($self->{extract_xpath}) {
+    if ($self->{extract_xpath} || $self->{extract_selector}) {
         eval { require HTML::TreeBuilder::XPath };
         if ($@) {
             Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
             return;
         }
 
+        my $selector = eval {
+            require HTML::Selector::XPath;
+            HTML::Selector::XPath->new;
+        };
+
+        if ($self->{extract_selector} && $@) {
+            Plagger->context->log(error => "HTML::Selector::XPath is required. $@");
+            return;
+        }
+
+        my $extractor = $self->{extract_selector} ? 'extract_selector' : 'extract_xpath';
+
         my $tree = HTML::TreeBuilder::XPath->new;
         $tree->parse($args->{content});
         $tree->eof;
 
-        for my $capture (keys %{$self->{extract_xpath}}) {
-            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
+        for my $capture (keys %{$self->{$extractor}}) {
+            my $xpath = $self->{extract_xpath}->{$capture} || do {
+                $selector->selector($self->{extract_selector}->{$capture});
+                $selector->to_xpath;
+            };
+            my @children = $tree->findnodes($xpath);
             if (@children) {
                 no warnings 'redefine';
                 local *HTML::Element::_xml_escape = \&xml_escape;
@@ -291,7 +307,7 @@
                     ? $children[0]->as_XML
                     : $children[0]->getValue;
             } else {
-                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
+                Plagger->context->log(error => "Can't find node matching $self->{$extractor}->{$capture}");
             }
         }
     }