articles.bib
@comment{{This file has been generated by bib2bib 1.96}}
@comment{{Command line: ./bib2bib -ob articles.bib -c '$type = "ARTICLE"' -s year -r mediamill.bib}}
@article{HabibianCVIU14,
author = {Amirhossein Habibian and Cees G. M. Snoek},
title = {Recommendations for Recognizing Video Events by Concept Vocabularies},
journal = {Computer Vision and Image Understanding},
pages = {},
month = {},
year = {2014},
volume = {},
number = {},
pdf = {},
note = {In press},
abstract = {
}
}
@article{KordumovaMMTA14,
author = {Svetlana Kordumova and Xirong Li and Cees G. M. Snoek},
title = {Best Practices for Learning Video Concept Detectors from Social Media Examples},
journal = {Multimedia Tools and Applications},
pages = {},
month = {},
year = {2014},
volume = {},
number = {},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/kordumova-practices-mmta.pdf},
note = {In press},
abstract = {
Learning video concept detectors from social media sources, such as Flickr images and YouTube videos, has the potential to address a wide variety of concept queries for video search. While the potential has been recognized by many, and progress on the topic has been impressive, we argue that key questions, crucial to knowing how to learn effective video concept detectors from social media examples, remain open. As an initial attempt to answer these questions, we conduct an experimental study using a video search engine which is capable of learning concept detectors from social media examples, be it socially tagged videos or socially tagged images. Within the video search engine we investigate three strategies for positive example selection, three negative example selection strategies and three learning strategies. The performance is evaluated on the challenging TRECVID 2012 benchmark consisting of 600 h of Internet video. From the experiments we derive four best practices: (1) tagged images are a better source for learning video concepts than tagged videos, (2) selecting tag relevant positive training examples is always beneficial, (3) selecting relevant negative examples is advantageous and should be treated differently for video and image sources, and (4) selecting relevant training data before learning concept detectors is better than incorporating the relevance during the learning process. The best practices within our video search engine lead to state-of-the-art performance in the TRECVID 2013 benchmark for concept detection without manually provided annotations.
}
}
@article{MyersMVA14,
author = {Gregory K. Myers and Ramesh Nallapati and Julien {van Hout} and Stephanie Pancoast and Ram Nevatia and Chen Sun and Amirhossein Habibian and Dennis C. Koelma and Koen E. A. van de Sande and Arnold W. M. Smeulders and Cees G. M. Snoek},
title = {Evaluating Multimedia Features and Fusion for Example-based Event Detection},
journal = {Machine Vision and Applications},
pages = {17--32},
month = {January},
year = {2014},
volume = {25},
number = {1},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/myers-features-fusion-events-mva.pdf},
abstract = {
Multimedia event detection (MED) is a challenging
problem because of the heterogeneous content and variable
quality found in large collections of Internet videos. To
study the value of multimedia features and fusion for representing
and learning events from a set of example video clips,
we created SESAME, a system for video SEarch with Speed
and Accuracy for Multimedia Events. SESAME includes
multiple bag-of-words event classifiers based on single data
types: low-level visual, motion, and audio features; high-level
semantic visual concepts; and automatic speech recognition.
Event detection performance was evaluated for each
event classifier. The performance of low-level visual and
motion features was improved by the use of difference coding.
The accuracy of the visual concepts was nearly as strong
as that of the low-level visual features. Experiments with a
number of fusion methods for combining the event detection
scores from these classifiers revealed that simple fusion
methods, such as arithmetic mean, perform as well as or better
than other, more complex fusion methods. SESAME's
performance in the 2012 TRECVID MED evaluation was
one of the best reported.
}
}
@article{LiTMM13,
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Dennis C. Koelma and Arnold W. M. Smeulders},
title = {Bootstrapping Visual Categorization with Relevant Negatives},
journal = {{IEEE} Transactions on Multimedia},
pages = {933--945},
month = {June},
year = {2013},
volume = {15},
number = {4},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-negative-tmm.pdf},
abstract = {
Learning classifiers for many visual concepts is important
for image categorization and retrieval. As a classifier tends
to misclassify negative examples which are visually similar to positive
ones, inclusion of such misclassified and thus relevant negatives
should be stressed during learning. User-tagged images are abundant
online, but which images are the relevant negatives remains
unclear. Sampling negatives at random is the de facto standard in
the literature. In this paper, we go beyond random sampling by
proposing Negative Bootstrap. Given a visual concept and a few
positive examples, the new algorithm iteratively finds relevant negatives.
Per iteration, we learn from a small proportion of many
user-tagged images, yielding an ensemble of meta classifiers. For
efficient classification, we introduce Model Compression such that
the classification time is independent of the ensemble size. Compared
with the state of the art, we obtain relative gains of 14\% and
18\% on two present-day benchmarks in terms of mean average
precision. For concept search in one million images, model compression
reduces the search time from over 20 h to approximately
6 min. The effectiveness and efficiency, without the need of manually
labeling any negatives, make negative bootstrap appealing for
learning better visual concept classifiers.
}
}
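@comment{{Editorial sketch (not from the paper): the abstract above describes the Negative Bootstrap loop at a high level. The minimal Python illustration below assumes a linear SVM via scikit-learn, fixed pool and ensemble sizes, and hypothetical variable names, and it omits the Model Compression step.

import numpy as np
from sklearn.svm import LinearSVC

def negative_bootstrap(pos_X, candidate_neg_X, iterations=10, sample_size=1000, keep=100):
    # Iteratively pick the candidate negatives that the current ensemble
    # scores highest (the "relevant negatives" it confuses with positives)
    # and train an extra meta classifier against them.
    ensemble = []
    for _ in range(iterations):
        idx = np.random.choice(len(candidate_neg_X), sample_size, replace=False)
        sample = candidate_neg_X[idx]
        if ensemble:
            scores = np.mean([clf.decision_function(sample) for clf in ensemble], axis=0)
            relevant_neg = sample[np.argsort(-scores)[:keep]]   # most confusing negatives
        else:
            relevant_neg = sample[:keep]                        # first round: random negatives
        X = np.vstack([pos_X, relevant_neg])
        y = np.concatenate([np.ones(len(pos_X)), np.zeros(len(relevant_neg))])
        ensemble.append(LinearSVC().fit(X, y))
    return ensemble

Scoring a new image would average decision_function values over the ensemble, which is where the paper's Model Compression comes in to keep classification time independent of the ensemble size.}}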
@article{HuurninkTMM12,
author = {Bouke Huurnink and Cees G. M. Snoek and Maarten {de Rijke} and Arnold W. M. Smeulders},
title = {Content-Based Analysis Improves Audiovisual Archive Retrieval},
journal = {{IEEE} Transactions on Multimedia},
pages = {1166--1178},
month = {August},
year = {2012},
volume = {14},
number = {4},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/huurnink-archive-tmm.pdf},
abstract = {
Content-based video retrieval is maturing to the point where it
can be used in real-world retrieval practices. One such practice
is the audiovisual archive, whose users increasingly require
fine-grained access to broadcast television content. In this
paper, we take into account the information needs and retrieval
data already present in the audiovisual archive, and demonstrate
that retrieval performance can be significantly improved when
content-based methods are applied to search. To the best of our
knowledge, this is the first time that the practice of an
audiovisual archive has been taken into account for quantitative
retrieval evaluation. To arrive at our main result, we propose
an evaluation methodology tailored to the specific needs and
circumstances of the audiovisual archive, which are typically
missed by existing evaluation initiatives. We utilize logged
searches, content purchases, session information, and simulators
to create realistic query sets and relevance judgments. To
reflect the retrieval practice of both the archive and the video
retrieval community as closely as possible, our experiments with
three video search engines incorporate archive-created catalog
entries as well as state-of-the-art multimedia content analysis
results. A detailed query-level analysis indicates that
individual content-based retrieval methods such as
transcript-based retrieval and concept-based retrieval yield
approximately equal performance gains. When combined, we find
that content-based video retrieval incorporated into the
archive’s practice results in significant performance increases
for shot retrieval and for retrieving entire television programs.
The time has come for audiovisual archives to start accommodating
content-based video retrieval methods into their daily practice.
}
}
@article{LiTMM12,
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
title = {Harvesting Social Images for Bi-Concept Search},
journal = {{IEEE} Transactions on Multimedia},
pages = {1091--1104},
month = {August},
year = {2012},
volume = {14},
number = {4},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-biconcept-tmm.pdf},
abstract = {
Searching for the co-occurrence of two visual concepts in unlabeled
images is an important step towards answering complex user queries.
Traditional visual search methods use combinations of the confidence
scores of individual concept detectors to tackle such queries. In
this paper we introduce the notion of bi-concepts, a new concept-based
retrieval method that is directly learned from social-tagged images.
As the number of potential bi-concepts is gigantic, manually collecting
training examples is infeasible. Instead, we propose a multimedia
framework to collect de-noised positive as well as informative negative
training examples from the social web, to learn bi-concept detectors
from these examples, and to apply them in a search engine for retrieving
bi-concepts in unlabeled images. We study the behavior of our bi-concept
search engine using 1.2M social-tagged images as a data source. Our
experiments indicate that harvesting examples for bi-concepts differs
from traditional single-concept methods, yet the examples can be
collected with high accuracy using a multi-modal approach. We find
that directly learning bi-concepts is better than oracle linear fusion
of single-concept detectors, with a relative improvement of 100\%.
This study reveals the potential of learning high-order semantics
from social images, for free, suggesting promising new lines of research.
}
}
@article{GavvesCVIU12,
author = {Efstratios Gavves and Cees G. M. Snoek and Arnold W. M. Smeulders},
title = {Visual Synonyms for Landmark Image Retrieval},
journal = {Computer Vision and Image Understanding},
pages = {238--249},
month = {February},
year = {2012},
volume = {116},
number = {2},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gavves-synonyms-cviu.pdf},
abstract = {
In this paper, we address the incoherence problem of the visual
words in bag-of-words vocabularies. Different from existing work,
which assigns words based on closeness in descriptor space, we
focus on identifying pairs of independent, distant words -- the
visual synonyms -- that are likely to host image patches of similar
visual reality. We focus on landmark images, where the image geometry
guides the detection of synonym pairs. Image geometry is used to
find those image features that lie in the nearly identical physical
location, yet are assigned to different words of the visual
vocabulary. Defined in this way, we evaluate the validity of visual
synonyms. We also examine the closeness of synonyms in the
L2-normalized feature space. We show that visual synonyms may
successfully be used for vocabulary reduction. Furthermore, we show
that by combining the reduced visual vocabularies with synonym
augmentation, we perform on par with the state-of-the-art
bag-of-words approach, while having a 98\% smaller vocabulary.
}
}
@article{StegginkMS11,
author = {Jeroen Steggink and Cees G. M. Snoek},
title = {Adding Semantics to Image-Region Annotations with the Name-It-Game},
journal = {Multimedia Systems},
pages = {367--378},
month = {October},
year = {2011},
volume = {17},
number = {5},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/steggink-name-it-game-mmsys.pdf},
abstract = {
In this paper we present the Name-It-Game, an interactive multimedia
game fostering the swift creation of a large data set of region-based
image annotations. Compared to existing annotation games, we consider
an added semantic structure, by means of the WordNet ontology, the main
innovation of the Name-It-Game. Using an ontology-powered game, instead
of the more traditional annotation tools, potentially makes region-based
image labeling more fun and accessible for every type of user. However,
the current games often present the players with hard-to-guess objects.
To prevent this from happening in the Name-It-Game, we successfully
identify WordNet categories which filter out hard-to-guess objects. To
verify the speed of the annotation process, we compare the online
Name-It-Game with a desktop tool with similar features. Results show
that the Name-It-Game outperforms this tool for semantic region-based
image labeling. Lastly, we measure the accuracy of the produced
segmentations and compare them with carefully created LabelMe
segmentations. Judging from the quantitative and qualitative results,
we believe the segmentations are competitive to those of LabelMe,
especially when averaged over multiple games. By adding semantics to
region-based image annotations, using the Name-It-Game, we have opened
up an efficient means to provide precious labels in a playful manner.
}
}
@article{SandeTMM11,
author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
title = {Empowering Visual Categorization with the {GPU}},
journal = {{IEEE} Transactions on Multimedia},
pages = {60--70},
month = {February},
year = {2011},
volume = {13},
number = {1},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-categorization-gpu-tmm.pdf},
abstract = {
Visual categorization is important to manage large collections of
digital images and video, where textual meta-data is often incomplete
or simply unavailable. The bag-of-words model has become the most
powerful method for visual categorization of images and video.
Despite its high accuracy, a severe drawback of this model is its
high computational cost. As newer CPU and GPU architectures gain
computational power chiefly by increasing their level of parallelism,
exploiting this parallelism becomes an important direction for handling
the computational cost of the bag-of-words approach. When optimizing a
system based on the bag-of-words approach, the goal is to minimize the
time it takes to process batches of images. Additionally, we consider
power usage as an evaluation
metric. In this paper, we analyze the bag-of-words model for visual
categorization in terms of computational cost and identify two major
bottlenecks: the quantization step and the classification step. We
address these two bottlenecks by proposing two efficient algorithms
for quantization and classification by exploiting the GPU hardware
and the CUDA parallel programming model. The algorithms are designed
to (1) keep categorization accuracy intact, (2) decompose the problem
and (3) give the same numerical results. In the experiments on large
scale datasets it is shown that, by using a parallel implementation
on the Geforce GTX260 GPU, classifying unseen images is 4.8 times
faster than a quad-core CPU version on the Core i7 920, while giving
the exact same numerical results. In addition, we show how the
algorithms can be generalized to other applications, such as text
retrieval and video retrieval. Moreover, when the obtained speedup is
used to process extra video frames in a video retrieval benchmark,
the accuracy of visual categorization is improved by 29\%.
}
}
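@comment{{Editorial sketch (not from the paper): the quantization bottleneck mentioned in the abstract above is nearest-prototype assignment of local descriptors to visual words, which can be written as one dense matrix computation; that formulation is what maps well onto GPU hardware. The CPU-side NumPy sketch below is illustrative only; the codebook layout and the L1 normalization are assumptions, and it is not the paper's CUDA implementation.

import numpy as np

def quantize(descriptors, codebook):
    # Squared Euclidean distance from every descriptor to every visual word,
    # expanded so that the dominant cost is a single matrix product.
    d2 = (descriptors ** 2).sum(axis=1, keepdims=True) \
         - 2.0 * descriptors.dot(codebook.T) \
         + (codebook ** 2).sum(axis=1)
    words = d2.argmin(axis=1)                     # hard assignment per descriptor
    hist = np.bincount(words, minlength=len(codebook)).astype(float)
    return hist / max(hist.sum(), 1.0)            # normalized bag-of-words vector}}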
@article{SandePAMI10,
author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
title = {Evaluating Color Descriptors for Object and Scene Recognition},
journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
pages = {1582--1596},
month = {September},
year = {2010},
volume = {32},
number = {9},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-colordescriptors-pami.pdf},
software = {http://www.colordescriptors.com},
abstract = {
Image category recognition is important to access visual information
on the level of objects and scene types. So far, intensity-based
descriptors have been widely used for feature extraction at salient
points. To increase illumination invariance and discriminative power,
color descriptors have been proposed. Because many different descriptors
exist, a structured overview is required of color invariant descriptors
in the context of image category recognition. Therefore, this paper
studies the invariance properties and the distinctiveness of color
descriptors in a structured way. The analytical invariance properties
of color descriptors are explored, using a taxonomy based on invariance
properties with respect to photometric transformations, and tested
experimentally using a dataset with known illumination conditions. In
addition, the distinctiveness of color descriptors is assessed
experimentally using two benchmarks, one from the image domain and one
from the video domain. From the theoretical and experimental results,
it can be derived that invariance to light intensity changes and light
color changes affects category recognition. The results reveal further
that, for light intensity changes, the usefulness of invariance is
category-specific. Overall, when choosing a single descriptor and no
prior knowledge about the dataset and object and scene categories is
available, the OpponentSIFT is recommended. Furthermore, a combined set
of color descriptors outperforms intensity-based SIFT and improves
category recognition by 8\% on the PASCAL VOC 2007 and by 7\% on the
MediaMill Challenge.
}
}
@article{ByrneMMTA10,
author = {Daragh Byrne and Aiden R. Doherty and Cees G. M. Snoek and Gareth J. F. Jones and Alan F. Smeaton},
title = {Everyday Concept Detection in Visual Lifelogs: Validation, Relationships and Trends},
journal = {Multimedia Tools and Applications},
pages = {119--144},
month = {August},
year = {2010},
volume = {49},
number = {1},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/byrne-everyday-concept-detection-mmta.pdf},
abstract = {
The Microsoft SenseCam is a small lightweight wearable camera used to
passively capture photos and other sensor readings from a user’s day-to-day activities.
It captures on average 3,000 images in a typical day, equating to almost 1 million
images per year. It can be used to aid memory by creating a personal multimedia
lifelog, or visual recording of the wearer’s life. However the sheer volume of image
data captured within a visual lifelog creates a number of challenges, particularly for
locating relevant content. Within this work, we explore the applicability of semantic
concept detection, a method often used within video retrieval, on the domain of
visual lifelogs. Our concept detector models the correspondence between low-level
visual features and high-level semantic concepts (such as indoors, outdoors, people,
buildings, etc.) using supervised machine learning. By doing so it determines the
probability of a concept’s presence. We apply detection of 27 everyday semantic
concepts on a lifelog collection composed of 257,518 SenseCam images from 5
users. The results were evaluated on a subset of 95,907 images, to determine the
accuracy for detection of each semantic concept. We conducted further analysis
on the temporal consistency, co-occurrence and relationships within the detected
concepts to more extensively investigate the robustness of the detectors within this
domain.
}
}
@article{SnoekCOM10,
author = {Cees G. M. Snoek and Arnold W. M. Smeulders},
title = {Visual-Concept Search Solved?},
journal = {{IEEE} Computer},
pages = {76--78},
month = {June},
year = {2010},
volume = {43},
number = {6},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-smeulders-solved-computer.pdf},
abstract = {
Progress in visual-concept search suggests that machine understanding of images
is within reach.
}
}
@article{RooijCGA10,
author = {Ork de Rooij and Marcel Worring and Jack J. van Wijk},
title = {MediaTable: Interactive Categorization of Multimedia Collections},
journal = {{IEEE} Computer Graphics and Applications},
pages = {42--51},
month = {May},
year = {2010},
volume = {30},
number = {5},
pdf = {http://www.science.uva.nl/research/publications/2010/deRooijCGA2010},
abstract = {
}
}
@article{GemertCVIU10,
author = {Jan C. van Gemert and Cees G. M. Snoek and Cor J. Veenman and Arnold W. M. Smeulders and Jan-Mark Geusebroek},
title = {Comparing Compact Codebooks for Visual Categorization},
journal = {Computer Vision and Image Understanding},
pages = {450--462},
month = {April},
year = {2010},
volume = {114},
number = {4},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gemert-compact-codebooks-cviu.pdf},
abstract = {
In the face of current large-scale video libraries, the practical applicability of
content-based indexing algorithms is constrained by their efficiency. This paper
strives for efficient large-scale video indexing by comparing various visual-based
concept categorization techniques. In visual categorization, the popular codebook
model has shown excellent categorization performance. The codebook model represents
continuous visual features by discrete prototypes predefined in a vocabulary. The
vocabulary size has a major impact on categorization efficiency, where a more compact
vocabulary is more efficient. However, smaller vocabularies typically score lower on
classification performance than larger vocabularies. This paper compares four approaches
to achieve a compact codebook vocabulary while retaining categorization performance.
For these four methods, we investigate the trade-off between codebook compactness
and categorization performance. We evaluate the methods on more than 200 h of challenging
video data with as many as 101 semantic concepts. The results allow us to create a
taxonomy of the four methods based on their efficiency and categorization performance.
}
}
@article{RooijTMM10,
author = {Ork de Rooij and Marcel Worring},
title = {Browsing Video Along Multiple Threads},
journal = {{IEEE} Transactions on Multimedia},
pages = {121--130},
month = {February},
year = {2010},
volume = {12},
number = {2},
pdf = {http://www.science.uva.nl/research/publications/2010/deRooijITM2010},
abstract = {
}
}
@article{LiTMM09,
author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
title = {Learning Social Tag Relevance by Neighbor Voting},
journal = {{IEEE} Transactions on Multimedia},
pages = {1310--1322},
month = {November},
year = {2009},
volume = {11},
number = {7},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-socialtagrelevance-tmm.pdf},
abstract = {
Social image analysis and retrieval is important
for helping people organize and access the increasing amount
of user-tagged multimedia. Since user tagging is known to be
uncontrolled, ambiguous, and overly personalized, a fundamental
problem is how to interpret the relevance of a user-contributed
tag with respect to the visual content the tag is describing.
Intuitively, if different persons label visually similar images using
the same tags, these tags are likely to reflect objective aspects
of the visual content. Starting from this intuition, we propose
in this paper a neighbor voting algorithm which accurately and
efficiently learns tag relevance by accumulating votes from visual
neighbors. Under a set of well defined and realistic assumptions,
we prove that our algorithm is a good tag relevance measurement
for both image ranking and tag ranking. Three experiments on
3.5 million Flickr photos demonstrate the general applicability
of our algorithm in both social image retrieval and image tag
suggestion. Our tag relevance learning algorithm substantially
improves upon baselines for all the experiments. The results
suggest that the proposed algorithm is promising for real-world
applications.
}
}
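@comment{{Editorial sketch (not from the paper): the neighbor-voting idea in the abstract above estimates how relevant a tag is to an image by counting how many visually similar images carry the same tag, corrected for the tag's overall frequency. A minimal Python illustration follows; the feature matrix, tag lists, neighborhood size k, and the exact prior correction are assumptions.

import numpy as np

def tag_relevance(features, tags, query_idx, query_tag, k=50):
    # features: one row of visual features per image; tags: list of tag sets.
    dists = np.linalg.norm(features - features[query_idx], axis=1)
    neighbors = np.argsort(dists)[1:k + 1]        # k nearest, skipping the image itself
    votes = sum(1 for i in neighbors if query_tag in tags[i])
    prior = k * sum(1 for t in tags if query_tag in t) / len(tags)
    return votes - prior                          # larger means the tag better fits the content}}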
@article{SnoekFNTIR09,
author = {Cees G. M. Snoek and Marcel Worring},
title = {Concept-Based Video Retrieval},
journal = {Foundations and Trends in Information Retrieval},
pages = {215--322},
year = {2009},
volume = {4},
number = {2},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-concept-based-video-retrieval-fntir.pdf},
abstract = {
In this paper, we review 300 references on video retrieval, indicating
when text-only solutions are unsatisfactory and showing the promising
alternatives, which are mostly concept-based. Therefore, central
to our discussion is the notion of a semantic concept: an objective
linguistic description of an observable entity. Specifically, we present
our view on how its automated detection, selection under uncertainty,
and interactive usage might solve the major scientific problem for video
retrieval: the semantic gap. To bridge the gap, we lay down the anatomy
of a concept-based video search engine. We present a component-wise
decomposition of such an interdisciplinary multimedia system, covering
influences from information retrieval, computer vision, machine learning,
and human-computer interaction. For each of the components we
review state-of-the-art solutions in the literature, each having different
characteristics and merits. Because of these differences, we cannot
understand the progress in video retrieval without serious evaluation
efforts such as carried out in the NIST TRECVID benchmark. We
discuss its data, tasks, results, and the many derived community
initiatives in creating annotations and baselines for repeatable experiments.
We conclude with our perspective on future challenges and
opportunities.
}
}
@article{SmeatonIJIST08,
author = {Alan F. Smeaton and Peter Wilkins and Marcel Worring and Ork de Rooij and Tat-Seng Chua and Huanbo Luan},
title = {Content-based Video Retrieval: Three Example Systems from {TRECVid}},
journal = {International Journal of Imaging Systems and Technology},
year = {2008},
volume = {18},
number = {2--3},
pages = {195--201},
pdf = {},
abstract = {
}
}
@article{NguyenJVLC08,
author = {Giang P. Nguyen and Marcel Worring},
title = {Interactive Access to Large Image Collections using Similarity-based Visualization},
journal = {Journal of Visual Languages and Computing},
month = {April},
year = {2008},
volume = {19},
number = {2},
pages = {203--224},
pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-similarity-visualization-jvlc.pdf},
abstract = {
Image collections are getting larger and larger. To access those
collections, systems for managing, searching, and browsing are
necessary. Visualization plays an essential role in such systems.
Existing visualization systems do not analyze all the problems
occurring when dealing with large visual collections. In this
paper, we make these problems explicit. From there, we establish
three general requirements: overview, visibility, and structure
preservation. Solutions for each requirement are proposed, as well
as functions balancing the different requirements. We present an
optimal visualization scheme, supporting users in interacting with
large image collections. Experimental results with a collection of
10,000 Corel images, using simulated user actions, show that the
proposed scheme significantly improves performance for a given
task compared to the 2D grid-based visualizations commonly used in
content-based image retrieval.
}
}
@article{SnoekMM08,
author = {Cees G. M. Snoek and Marcel Worring and Ork de Rooij and Koen E. A. {van de Sande} and Rong Yan and Alexander G. Hauptmann},
title = {{VideOlympics}: Real-Time Evaluation of Multimedia Retrieval Systems},
journal = {{IEEE} MultiMedia},
pages = {86--91},
month = {January--March},
year = {2008},
volume = {15},
number = {1},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-videolympics-mm.pdf},
abstract = {
Video search is an experience for the senses. As a result, traditional
information retrieval metrics can't fully measure the quality of a video
search system. To provide a more interactive assessment of today's video
search engines, the authors have organized the VideOlympics as a real-time
evaluation showcase where systems compete to answer specific video searches
in front of a live audience. At VideOlympics, seeing and hearing is believing.
}
}
@article{NguyenTOMCCAP08,
author = {Giang P. Nguyen and Marcel Worring},
title = {Optimization of Interactive Visual-Similarity-Based Search},
journal = {{ACM} Transactions on Multimedia Computing, Communications and Applications},
month = {January},
year = {2008},
volume = {4},
number = {1},
pages = {7:1--23},
pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-optimization-tomccap.pdf},
abstract = {
At one end of the spectrum, research in interactive content-based
retrieval concentrates on machine learning methods for effective
use of relevance feedback. On the other end, the information
visualization community focuses on effective methods for conveying
information to the user. What is lacking is research considering
the information visualization and interactive retrieval as truly
integrated parts of one content-based search system. In such an
integrated system, there are many degrees of freedom like the
similarity function, the number of images to display, the image
size, different visualization modes, and possible feedback modes.
To base the optimal values for all of those on user studies is
unfeasible. We therefore develop search scenarios in which tasks
and user actions are simulated. From there, the proposed scheme is
optimized based on objective constraints and evaluation criteria.
In such a manner, the degrees of freedom are reduced and the
remaining degrees can be evaluated in user studies. In this article,
we present a system that integrates advanced similarity based
visualization with active learning. We have performed extensive
experimentation on interactive category search with different
image collections. The results using the proposed simulation
scheme show that indeed the use of advanced visualization and
active learning pays off in all of these datasets.
}
}
@article{NguyenTMM07,
author = {Giang P. Nguyen and Marcel Worring and Arnold W. M. Smeulders},
title = {Interactive Search by Direct Manipulation of Dissimilarity Space},
journal = {{IEEE} Transactions on Multimedia},
month = {November},
year = {2007},
volume = {9},
number = {7},
pages = {1404--1415},
pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-dissimilarity-tmm.pdf},
abstract = {
In this paper, we argue for learning dissimilarity for interactive search in
content-based image retrieval. In the literature, dissimilarity is often learned
via the feature space, by feature selection, feature weighting, or by adjusting
the parameters of a function of the features. Unlike existing techniques,
we use feedback to adjust the dissimilarity space independently of the feature space.
This has the great advantage that it manipulates dissimilarity directly. To
create a dissimilarity space, we use the method proposed by Pekalska and Duin,
selecting a set of images called prototypes and computing distances to those
prototypes for all images in the collection. After the user gives feedback,
we apply active learning with a one-class support vector machine to decide the
movement of images such that relevant images stay close together while irrelevant
ones are pushed away (the work of Guo). The dissimilarity space is then adjusted
accordingly. Results on a Corel dataset of 10,000 images and a TRECVID collection
of 43907 keyframes show that our proposed approach is not only intuitive, it
also significantly improves the retrieval performance.
}
}
@article{SeinstraMM07,
author = {Frank J. Seinstra and Jan-Mark Geusebroek and Dennis Koelma and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
title = {High-Performance Distributed Image and Video Content Analysis with Parallel-Horus},
journal = {{IEEE} MultiMedia},
pages = {64--75},
month = {October--December},
year = {2007},
volume = {14},
number = {4},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/seinstra-parallel-horus-mm.pdf},
abstract = {
As the world uses more digital video that requires greater storage space,
Grid computing is becoming indispensable for urgent problems in multimedia
content analysis. Parallel-Horus, a support tool for applications in multimedia
Grid computing, lets users implement multimedia applications as sequential
programs for efficient execution on clusters and Grids, based on wide-area
multimedia services.
}
}
@article{SnoekTMM07b,
author = {Cees G. M. Snoek and Bouke Huurnink and Laura Hollink and Maarten {de Rijke} and Guus Schreiber and Marcel Worring},
title = {Adding Semantics to Detectors for Video Retrieval},
journal = {{IEEE} Transactions on Multimedia},
month = {August},
year = {2007},
volume = {9},
number = {5},
pages = {975--986},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-semantics2detectors-tmm.pdf},
abstract = {
In this paper, we propose an automatic video retrieval method based on high-level
concept detectors. Research in video analysis has reached the point where over 100
concept detectors can be learned in a generic fashion, albeit with mixed performance.
Such a set of detectors is very small still compared to ontologies aiming to capture
the full vocabulary a user has. We aim to throw a bridge between the two fields by
building a multimedia thesaurus, i.e., a set of machine learned concept detectors
that is enriched with semantic descriptions and semantic structure obtained from
WordNet. Given a multimodal user query, we identify three strategies to select a
relevant detector from this thesaurus, namely: text matching, ontology querying,
and semantic visual querying. We evaluate the methods against the automatic search
task of the TRECVID 2005 video retrieval benchmark, using a news video archive of
85 h in combination with a thesaurus of 363 machine learned concept detectors. We
assess the influence of thesaurus size on video search performance, evaluate and
compare the multimodal selection strategies for concept detectors, and finally
discuss their combined potential using oracle fusion. The set of queries in the
TRECVID 2005 corpus is too small for us to be definite in our conclusions, but the
results suggest promising new lines of research.
}
}
@article{WorringTMM07,
author = {Marcel Worring and Guus Schreiber},
title = {Semantic Image and Video Indexing in Broad Domains},
journal = {{IEEE} Transactions on Multimedia},
month = {August},
year = {2007},
volume = {9},
number = {5},
pages = {909--911},
pdf = {http://www.science.uva.nl/research/mediamill/pub/worring-special-issue-tmm.pdf},
abstract = {
The six papers in this special section focus on semantic image and
video indexing in broad domains. To bring semantics to the user in
broad domains, both the indexing and the retrieval step have to be considered.
The papers here address both steps and the relation to ontologies.
}
}
@article{SnoekTMM07,
author = {Cees G. M. Snoek and Marcel Worring and Dennis C. Koelma and Arnold W. M. Smeulders},
title = {A Learned Lexicon-Driven Paradigm for Interactive Video Retrieval},
journal = {{IEEE} Transactions on Multimedia},
month = {February},
year = {2007},
volume = {9},
number = {2},
pages = {280--292},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-lexicon-tmm.pdf},
abstract = {
Effective video retrieval is the result of an interplay between
interactive query selection, advanced visualization of results, and
a goal-oriented human user. Traditional interactive video retrieval
approaches emphasize paradigms, such as query-by-keyword and
query-by-example, to aid the user in the search for relevant
footage. However, recent results in automatic indexing indicate that
query-by-concept is becoming a viable resource for interactive
retrieval also. We propose in this paper a new video retrieval
paradigm. The core of the paradigm is formed by first detecting a
large lexicon of semantic concepts. From there, we combine
query-by-concept, query-by-example, query-by-keyword, and user
interaction into the \emph{MediaMill} semantic video search engine.
To measure the impact of increasing lexicon size on interactive
video retrieval performance, we performed two experiments against
the 2004 and 2005 NIST TRECVID benchmarks, using lexicons containing
32 and 101 concepts respectively. The results suggest that from all
factors that play a role in interactive retrieval, a large lexicon
of semantic concepts matters most. Indeed, by exploiting large
lexicons, many video search questions are solvable without using
query-by-keyword and query-by-example. What is more, we show that
the lexicon-driven search engine outperforms all state-of-the-art
video retrieval systems in both TRECVID 2004 and 2005.
}
}
@article{SnoekPAMI06,
author = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra and Arnold W. M. Smeulders},
title = {The Semantic Pathfinder: Using an Authoring Metaphor for Generic Multimedia Indexing},
journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
month = {October},
year = {2006},
volume = {28},
number = {10},
pages = {1678--1689},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-pathfinder-pami.pdf},
abstract = {
This paper presents the semantic pathfinder architecture for
generic indexing of multimedia archives. The semantic pathfinder
extracts semantic concepts from video by exploring different paths
through three consecutive analysis steps, which we derive from the
observation that produced video is the result of an
authoring-driven process. We exploit this \emph{authoring
metaphor} for machine-driven understanding. The pathfinder starts
with the content analysis step. In this analysis step, we follow a
data-driven approach of indexing semantics. The style analysis
step is the second analysis step. Here we tackle the indexing
problem by viewing a video from the perspective of production.
Finally, in the context analysis step, we view semantics in
context. The virtue of the semantic pathfinder is its ability to
learn the best path of analysis steps on a per-concept basis. To
show the generality of this novel indexing approach we develop
detectors for a lexicon of 32 concepts and we evaluate the
semantic pathfinder against the 2004 NIST TRECVID video retrieval
benchmark, using a news archive of 64 hours. Top ranking
performance in the semantic concept detection task indicates the
merit of the semantic pathfinder for generic indexing of
multimedia archives.
}
}
@article{SnoekTOMCCAP06,
author = {Cees G. M. Snoek and Marcel Worring and Alexander G. Hauptmann},
title = {Learning Rich Semantics from News Video Archives by Style Analysis},
journal = {{ACM} Transactions on Multimedia Computing, Communications and Applications},
month = {May},
year = {2006},
volume = {2},
number = {2},
pages = {91--108},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-style-tomccap.pdf},
abstract = {
We propose a generic and robust framework for news video indexing, which
we base on a broadcast news production model. We identify within this
model four production phases, each providing useful metadata for annotation.
In contrast to semi-automatic indexing approaches, which exploit this
information at production time, we adhere to an automatic data-driven
approach. To that end, we analyze a digital news video using a separate
set of multimodal detectors for each production phase. By combining the
resulting production-derived features into a statistical classifier
ensemble, the framework facilitates robust classification of several rich
semantic concepts in news video; rich meaning that concepts share many
similarities in their production process. Experiments on an archive of
120 hours of news video, from the 2003 TRECVID benchmark, show that a
combined analysis of production phases yields the best results. In addition,
we demonstrate that the accuracy of the proposed style analysis framework
for classification of several rich semantic concepts is state-of-the-art.
}
}
@article{HollinkVISP05,
author = {Laura Hollink and Giang Nguyen and Dennis C. Koelma and Guus Schreiber and Marcel Worring},
title = {Assessing user behaviour in news video retrieval},
journal = {{IEE} Proceedings - Vision, Image and Signal Processing},
month = {December},
year = {2005},
volume = {152},
number = {6},
pages = {911--918},
pdf = {http://staff.science.uva.nl/~giangnp/pubs/pdf/2005/Hollink05.pdf},
abstract = {
The results of a study are presented, in which people queried a news
archive using an interactive video retrieval system. 242 search sessions
by 39 participants on 24 topics were assessed. Before, during and after
the study, participants filled in questionnaires about their expectations
of a search. The questionnaire data, logged user actions on the system,
queries formulated by users, and a quality measure of each search were
studied. The results of the study show that topics concerning 'specific'
people or objects were better retrieved than topics concerning 'general'
objects and scenes. Users were able to estimate the overall quality of a
search but did not know when the optimal result was reached within the
search process. Analysis of the results at various stages in the retrieval
process suggests that retrieval based on transcriptions of the speech in
video data adds more to the average precision of the result than
content-based image retrieval based on low-level visual features. The
latter is particularly useful in providing the user with an overview of
the dataset and thus an indication of the success of a search. Based on
the results, implications for the design of user interfaces of video
retrieval systems are discussed.
}
}
@article{NguyenMS05,
author = {Giang P. Nguyen and Marcel Worring},
title = {Relevance feedback based saliency adaptation in {CBIR}},
journal = {Multimedia Systems},
month = {October},
year = {2005},
volume = {10},
number = {6},
pages = {499--512},
pdf = {http://staff.science.uva.nl/~giangnp/pubs/pdf/2005/giangACM_MS05.pdf},
abstract = {
Content-based image retrieval ({CBIR}) has been under investigation
for a long time with many systems built to meet different
application demands. However, in all systems, there is still a gap
between the user's expectation and the system's retrieval
capabilities. Therefore, user interaction is an essential
component of any {CBIR} system. Interaction up to now has mostly
focused on changing global image features or similarities between
images. We consider the interaction with salient details in the
image, i.e., points, lines, and regions. Interactive salient detail
definition goes further than summarizing the image into a set of
salient details. We aim to dynamically update the user- and
context-dependent definition of saliency based on relevance
feedback. To that end, we propose an interaction framework for
salient details from the perspective of the user. A number of
instantiations of the framework are presented. Finally, we apply
our approach for query refinement in a detail-based image retrieval
system with salient points and regions. Experimental results demonstrate
the effectiveness of adapting the saliency from user feedback in
the retrieval process.
}
}
@article{SnoekTMM05,
author = {Cees G. M. Snoek and Marcel Worring},
title = {Multimedia Event-Based Video Indexing using Time Intervals},
journal = {{IEEE} Transactions on Multimedia},
month = {August},
year = {2005},
volume = {7},
number = {4},
pages = {638--647},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-time-mm.pdf},
abstract = {
We propose the Time Interval Multimedia Event (TIME) framework as a robust
approach for classification of semantic events in multimodal video documents.
The representation used in TIME extends the Allen time relations and allows
for proper inclusion of context and synchronization of the heterogeneous
information sources involved in multimodal video analysis. To demonstrate the
viability of our approach, it was evaluated on the domains of soccer and news
broadcasts. For automatic classification of semantic events, we compare three
different machine learning techniques, i.e., a C4.5 decision tree, Maximum
Entropy, and a Support Vector Machine. The results show that semantic video
indexing significantly benefits from using the TIME framework.
}
}
@article{SnoekMTAP05,
author = {Cees G. M. Snoek and Marcel Worring},
title = {Multimodal Video Indexing: A Review of the State-of-the-art},
journal = {Multimedia Tools and Applications},
month = {January},
year = {2005},
volume = {25},
number = {1},
pages = {5--35},
pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-review-mmta.pdf},
abstract = {
Efficient and effective handling of video documents depends on the availability
of indexes. Manual indexing is unfeasible for large video collections. In this
paper we survey several methods aiming at automating this time- and
resource-consuming process. Good reviews on single-modality-based video indexing have
appeared in literature. Effective indexing, however, requires a multimodal
approach in which either the most appropriate modality is selected or the
different modalities are used in collaborative fashion. Therefore, instead of
separately treating the different information sources involved, and their
specific algorithms, we focus on the similarities and differences between the
modalities. To that end we put forward a unifying and multimodal framework,
which views a video document from the perspective of its author. This framework
forms the guiding principle for identifying index types, for which automatic
methods are found in literature. It furthermore forms the basis for
categorizing these different methods.
}
}