articles.bib

@comment{{This file has been generated by bib2bib 1.96}}
@comment{{Command line: ./bib2bib -ob articles.bib -c '$type = "ARTICLE"' -s year -r mediamill.bib}}
@article{HabibianCVIU14,
  author = {Amirhossein Habibian and Cees G. M. Snoek},
  title = {Recommendations for Recognizing Video Events by Concept Vocabularies},
  journal = {Computer Vision and Image Understanding},
  pages = {},
  month = {},
  year = {2014},
  volume = {},
  number = {},
  pdf = {},
  note = {In press},
}
@article{KordumovaMMTA14,
  author = {Svetlana Kordumova and Xirong Li and Cees G. M. Snoek},
  title = {Best Practices for Learning Video Concept Detectors from Social Media Examples},
  journal = {Multimedia Tools and Applications},
  pages = {},
  month = {},
  year = {2014},
  volume = {},
  number = {},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/kordumova-practices-mmta.pdf},
  note = {In press},
  abstract = {
  
  		Learning video concept detectors from social media sources, such as Flickr images and YouTube videos, has the potential to address a wide variety of concept queries for video search. While the potential has been recognized by many, and progress on the topic has been impressive, we argue that a key question remains open: how to learn effective video concept detectors from social media examples? As an initial attempt to answer this question, we conduct an experimental study using a video search engine which is capable of learning concept detectors from social media examples, be it socially tagged videos or socially tagged images. Within the video search engine we investigate three strategies for positive example selection, three negative example selection strategies and three learning strategies. The performance is evaluated on the challenging TRECVID 2012 benchmark consisting of 600 h of Internet video. From the experiments we derive four best practices: (1) tagged images are a better source for learning video concepts than tagged videos, (2) selecting tag-relevant positive training examples is always beneficial, (3) selecting relevant negative examples is advantageous and should be treated differently for video and image sources, and (4) learning concept detectors from relevant training data selected before learning is better than incorporating the relevance during the learning process. The best practices within our video search engine lead to state-of-the-art performance in the TRECVID 2013 benchmark for concept detection without manually provided annotations.
    		
  		}
}
@article{MyersMVA14,
  author = {Gregory K. Myers and Ramesh Nallapati and Julien {van Hout} and Stephanie Pancoast and Ram Nevatia and Chen Sun and Amirhossein Habibian and Dennis C. Koelma and Koen E. A. van de Sande and Arnold W. M. Smeulders and Cees G. M. Snoek},
  title = {Evaluating Multimedia Features and Fusion for Example-based Event Detection},
  journal = {Machine Vision and Applications},
  pages = {17--32},
  month = {January},
  year = {2014},
  volume = {25},
  number = {1},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/myers-features-fusion-events-mva.pdf},
  abstract = {  
  
  		Multimedia event detection (MED) is a challenging
		problem because of the heterogeneous content and variable
		quality found in large collections of Internet videos. To
		study the value of multimedia features and fusion for representing
		and learning events from a set of example video clips,
		we created SESAME, a system for video SEarch with Speed
		and Accuracy for Multimedia Events. SESAME includes
		multiple bag-of-words event classifiers based on single data
		types: low-level visual, motion, and audio features; high-level
		semantic visual concepts; and automatic speech recognition.
		Event detection performance was evaluated for each
		event classifier. The performance of low-level visual and
		motion features was improved by the use of difference coding.
		The accuracy of the visual concepts was nearly as strong
		as that of the low-level visual features. Experiments with a
		number of fusion methods for combining the event detection
		scores from these classifiers revealed that simple fusion
		methods, such as arithmetic mean, perform as well as or better
		than other, more complex fusion methods. SESAME's 
		performance in the 2012 TRECVID MED evaluation was
		one of the best reported.
    		
  		}
}
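@comment{{The abstract above reports that simple fusion methods, such as the arithmetic mean
  of the per-classifier event detection scores, perform as well as or better than more complex
  fusion methods. A minimal Python sketch of that arithmetic-mean late-fusion step, using
  hypothetical score arrays; it illustrates the idea only and is not the SESAME code:

    import numpy as np

    def fuse_scores(score_lists):
        # score_lists: one 1-D array per event classifier,
        # each holding a detection score for every test clip.
        scores = np.vstack(score_lists)    # shape: (num_classifiers, num_clips)
        return scores.mean(axis=0)         # arithmetic-mean late fusion

    # Usage with three hypothetical classifiers scoring four clips:
    fused = fuse_scores([np.array([0.2, 0.9, 0.4, 0.1]),
                         np.array([0.3, 0.8, 0.5, 0.2]),
                         np.array([0.1, 0.7, 0.6, 0.3])])
}}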
@article{LiTMM13,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Dennis C. Koelma and Arnold W. M. Smeulders},
  title = {Bootstrapping Visual Categorization with Relevant Negatives},
  journal = {{IEEE} Transactions on Multimedia},
  pages = {933--945},
  month = {June},
  year = {2013},
  volume = {15},
  number = {4},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-negative-tmm.pdf},
  abstract = {
  
		Learning classifiers for many visual concepts is important
		for image categorization and retrieval. As a classifier tends
		to misclassify negative examples which are visually similar to positive
		ones, inclusion of such misclassified and thus relevant negatives
		should be stressed during learning. User-tagged images are abundant
		online, but which images are the relevant negatives remains
		unclear. Sampling negatives at random is the de facto standard in
		the literature. In this paper, we go beyond random sampling by
		proposing Negative Bootstrap. Given a visual concept and a few
		positive examples, the new algorithm iteratively finds relevant negatives.
		Per iteration, we learn from a small proportion of many
		user-tagged images, yielding an ensemble of meta classifiers. For
		efficient classification, we introduce Model Compression such that
		the classification time is independent of the ensemble size. Compared
		with the state of the art, we obtain relative gains of 14\% and
		18\% on two present-day benchmarks in terms of mean average
		precision. For concept search in one million images, model compression
		reduces the search time from over 20 h to approximately
		6 min. The effectiveness and efficiency, without the need of manually
		labeling any negatives, make negative bootstrap appealing for
		learning better visual concept classifiers.
    		
  		}
}
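@comment{{One way to read the Negative Bootstrap idea above: per iteration, sample a random
  pool of user-tagged images as candidate negatives, keep the candidates the current ensemble
  scores most like positives (the misclassified, hence relevant, negatives), train a new meta
  classifier on them together with the positives, and add it to the ensemble. A hedged Python
  sketch under those assumptions, with hypothetical feature matrices; it is not the authors'
  implementation and omits the model compression step:

    import numpy as np
    from sklearn.svm import LinearSVC

    def negative_bootstrap(X_pos, X_candidates, iterations=10, pool=1000, keep=100):
        ensemble = []
        for _ in range(iterations):
            idx = np.random.choice(len(X_candidates), size=pool, replace=False)
            X_pool = X_candidates[idx]
            if ensemble:
                # Relevance: candidates the current ensemble scores highest as positive.
                scores = np.mean([clf.decision_function(X_pool) for clf in ensemble], axis=0)
                X_neg = X_pool[np.argsort(-scores)[:keep]]
            else:
                X_neg = X_pool[:keep]   # first round: plain random negatives
            X = np.vstack([X_pos, X_neg])
            y = np.hstack([np.ones(len(X_pos)), -np.ones(len(X_neg))])
            ensemble.append(LinearSVC().fit(X, y))
        return ensemble
}}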
@article{HuurninkTMM12,
  author = {Bouke Huurnink and Cees G. M. Snoek and Maarten {de Rijke} and Arnold W. M. Smeulders},
  title = {Content-Based Analysis Improves Audiovisual Archive Retrieval},
  journal = {{IEEE} Transactions on Multimedia},
  pages = {1166--1178},
  month = {August},
  year = {2012},
  volume = {14},
  number = {4},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/huurnink-archive-tmm.pdf},
  abstract = {
  		 
  		 Content-based video retrieval is maturing to the point where it 
  		 can be used in real-world retrieval practices. One such practice 
  		 is the audiovisual archive, whose users increasingly require 
  		 fine-grained access to broadcast television content. In this 
  		 paper, we take into account the information needs and retrieval 
  		 data already present in the audiovisual archive, and demonstrate
		 that retrieval performance can be significantly improved when 
		 content-based methods are applied to search. To the best of our 
		 knowledge, this is the first time that the practice of an 
		 audiovisual archive has been taken into account for quantitative 
		 retrieval evaluation. To arrive at our main result, we propose 
		 an evaluation methodology tailored to the specific needs and 
		 circumstances of the audiovisual archive, which are typically 
		 missed by existing evaluation initiatives. We utilize logged 
		 searches, content purchases, session information, and simulators 
		 to create realistic query sets and relevance judgments. To 
		 reflect the retrieval practice of both the archive and the video 
		 retrieval community as closely as possible, our experiments with 
		 three video search engines incorporate archive-created catalog 
		 entries as well as state-of-the-art multimedia content analysis 
		 results. A detailed query-level analysis indicates that 
		 individual content-based retrieval methods such as 
		 transcript-based retrieval and concept-based retrieval yield 
		 approximately equal performance gains. When combined, we find 
		 that content-based video retrieval incorporated into the 
		 archive’s practice results in significant performance increases 
		 for shot retrieval and for retrieving entire television programs. 
		 The time has come for audiovisual archives to start accommodating 
		 content-based video retrieval methods into their daily practice.
    		
  		}
}
@article{LiTMM12,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title = {Harvesting Social Images for Bi-Concept Search},
  journal = {{IEEE} Transactions on Multimedia},
  pages = {1091--1104},
  month = {August},
  year = {2012},
  volume = {14},
  number = {4},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-biconcept-tmm.pdf},
  abstract = {
  		 
  		 Searching for the co-occurrence of two visual concepts in unlabeled 
  		 images is an important step towards answering complex user queries. 
  		 Traditional visual search methods use combinations of the confidence 
  		 scores of individual concept detectors to tackle such queries. In 
  		 this paper we introduce the notion of bi-concepts, a new concept-based 
  		 retrieval method that is directly learned from social-tagged images. 
  		 As the number of potential bi-concepts is gigantic, manually collecting 
  		 training examples is infeasible. Instead, we propose a multimedia 
  		 framework to collect de-noised positive as well as informative negative 
  		 training examples from the social web, to learn bi-concept detectors 
  		 from these examples, and to apply them in a search engine for retrieving 
  		 bi-concepts in unlabeled images. We study the behavior of our bi-concept 
  		 search engine using 1.2M social-tagged images as a data source. Our 
  		 experiments indicate that harvesting examples for bi-concepts differs 
  		 from traditional single-concept methods, yet the examples can be 
  		 collected with high accuracy using a multi-modal approach. We find 
  		 that directly learning bi-concepts is better than oracle linear fusion 
  		 of single-concept detectors, with a relative improvement of 100\%. 
  		 This study reveals the potential of learning high-order semantics 
  		 from social images, for free, suggesting promising new lines of research.
    		
  		}
}
@article{GavvesCVIU12,
  author = {Efstratios Gavves and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Visual Synonyms for Landmark Image Retrieval},
  journal = {Computer Vision and Image Understanding},
  pages = {238--249},
  month = {February},
  year = {2012},
  volume = {116},
  number = {2},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gavves-synonyms-cviu.pdf},
  abstract = {
  		 
  		 In this paper, we address the incoherence problem of the visual 
  		 words in bag-of-words vocabularies. Different from existing work, 
  		 which assigns words based on closeness in descriptor space, we 
  		 focus on identifying pairs of independent, distant words -- the 
  		 visual synonyms -- that are likely to host image patches of similar 
  		 visual reality. We focus on landmark images, where the image geometry 
  		 guides the detection of synonym pairs. Image geometry is used to 
  		 find those image features that lie in the nearly identical physical 
  		 location, yet are assigned to different words of the visual 
  		 vocabulary. Defined in this way, we evaluate the validity of visual 
  		 synonyms. We also examine the closeness of synonyms in the 
  		 L2-normalized feature space. We show that visual synonyms may 
  		 successfully be used for vocabulary reduction. Furthermore, we show 
  		 that, by combining the reduced visual vocabularies with synonym 
  		 augmentation, we perform on par with the state-of-the-art 
  		 bag-of-words approach, while having a 98\% smaller vocabulary.
    		
  		}
}
@article{StegginkMS11,
  author = {Jeroen Steggink and Cees G. M. Snoek},
  title = {Adding Semantics to Image-Region Annotations with the Name-It-Game},
  journal = {Multimedia Systems},
  pages = {367--378},
  month = {October},
  year = {2011},
  volume = {17},
  number = {5},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/steggink-name-it-game-mmsys.pdf},
  abstract = {
  		 
  		 In this paper we present the Name-It-Game, an interactive multimedia 
  		 game fostering the swift creation of a large data set of region-based 
  		 image annotations. Compared to existing annotation games, we consider 
  		 an added semantic structure, by means of the WordNet ontology, the main 
  		 innovation of the Name-It-Game. Using an ontology-powered game, instead 
  		 of the more traditional annotation tools, potentially makes region-based 
  		 image labeling more fun and accessible for every type of user. However, 
  		 the current games often present the players with hard-to-guess objects. 
  		 To prevent this from happening in the Name-It-Game, we successfully 
  		 identify WordNet categories which filter out hard-to-guess objects. To 
  		 verify the speed of the annotation process, we compare the online 
  		 Name-It-Game with a desktop tool with similar features. Results show 
  		 that the Name-It-Game outperforms this tool for semantic region-based 
  		 image labeling. Lastly, we measure the accuracy of the produced 
  		 segmentations and compare them with carefully created LabelMe 
  		 segmentations. Judging from the quantitative and qualitative results, 
  		 we believe the segmentations are competitive with those of LabelMe, 
  		 especially when averaged over multiple games. By adding semantics to 
  		 region-based image annotations, using the Name-It-Game, we have opened 
  		 up an efficient means to provide precious labels in a playful manner.
    		
  		}
}
@article{SandeTMM11,
  author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title = {Empowering Visual Categorization with the {GPU}},
  journal = {{IEEE} Transactions on Multimedia},
  pages = {60--70},
  month = {February},
  year = {2011},
  volume = {13},
  number = {1},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-categorization-gpu-tmm.pdf},
  abstract = {
   		  
   		  Visual categorization is important to manage large collections of 
   		  digital images and video, where textual meta-data is often incomplete 
   		  or simply unavailable. The bag-of-words model has become the most 
   		  powerful method for visual categorization of images and video. 
   		  Despite its high accuracy, a severe drawback of this model is its 
   		  high computational cost. As newer CPU and GPU architectures increase 
   		  their computational power mainly by increasing their level of 
   		  parallelism, exploiting this parallelism becomes an important 
   		  direction to handle the computational cost of the bag-of-words 
   		  approach. When optimizing a system based on the bag-of-words approach, 
   		  the goal is to minimize the time it takes to process batches of 
   		  images. Additionally, we also consider power usage as an evaluation 
   		  metric. In this paper, we analyze the bag-of-words model for visual 
   		  categorization in terms of computational cost and identify two major 
   		  bottlenecks: the quantization step and the classification step. We 
   		  address these two bottlenecks by proposing two efficient algorithms 
   		  for quantization and classification by exploiting the GPU hardware 
   		  and the CUDA parallel programming model. The algorithms are designed 
   		  to (1) keep categorization accuracy intact, (2) decompose the problem 
   		  and (3) give the same numerical results. In the experiments on large 
   		  scale datasets it is shown that, by using a parallel implementation 
   		  on the Geforce GTX260 GPU, classifying unseen images is 4.8 times 
   		  faster than a quad-core CPU version on the Core i7 920, while giving 
   		  the exact same numerical results. In addition, we show how the 
   		  algorithms can be generalized to other applications, such as text 
   		  retrieval and video retrieval. Moreover, when the obtained speedup is 
   		  used to process extra video frames in a video retrieval benchmark, 
   		  the accuracy of visual categorization is improved by 29\%. 

                }
}
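@comment{{The quantization bottleneck analysed above amounts to assigning every local descriptor
  to its nearest codebook word, a dense all-pairs distance computation that maps naturally onto
  data-parallel hardware. A small NumPy sketch of that step, vectorised on the CPU rather than
  the authors' CUDA kernels, with hypothetical array shapes:

    import numpy as np

    def quantize(descriptors, codebook):
        # descriptors: (n, d) local features; codebook: (k, d) visual words.
        # Squared Euclidean distances via |x - c|^2 = |x|^2 - 2 x.c + |c|^2.
        d2 = (np.sum(descriptors**2, axis=1)[:, None]
              - 2.0 * descriptors @ codebook.T
              + np.sum(codebook**2, axis=1)[None, :])
        assignment = np.argmin(d2, axis=1)   # nearest word per descriptor
        return np.bincount(assignment, minlength=len(codebook))   # bag-of-words histogram
}}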
@article{SandePAMI10,
  author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title = {Evaluating Color Descriptors for Object and Scene Recognition},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  pages = {1582--1596},
  month = {September},
  year = {2010},
  volume = {32},
  number = {9},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-colordescriptors-pami.pdf},
  software = {http://www.colordescriptors.com},
  abstract = {
  		 
  		  Image category recognition is important to access visual information 
  		  on the level of objects and scene types. So far, intensity-based 
  		  descriptors have been widely used for feature extraction at salient 
  		  points. To increase illumination invariance and discriminative power, 
  		  color descriptors have been proposed. Because many different descriptors 
  		  exist, a structured overview is required of color invariant descriptors 
  		  in the context of image category recognition. Therefore, this paper 
  		  studies the invariance properties and the distinctiveness of color 
  		  descriptors in a structured way. The analytical invariance properties 
  		  of color descriptors are explored, using a taxonomy based on invariance 
  		  properties with respect to photometric transformations, and tested 
  		  experimentally using a dataset with known illumination conditions. In 
  		  addition, the distinctiveness of color descriptors is assessed 
  		  experimentally using two benchmarks, one from the image domain and one 
  		  from the video domain. From the theoretical and experimental results, 
  		  it can be derived that invariance to light intensity changes and light 
  		  color changes affects category recognition. The results reveal further 
  		  that, for light intensity changes, the usefulness of invariance is 
  		  category-specific. Overall, when choosing a single descriptor and no 
  		  prior knowledge about the dataset and object and scene categories is 
  		  available, the OpponentSIFT is recommended. Furthermore, a combined set 
  		  of color descriptors outperforms intensity-based SIFT and improves 
  		  category recognition by 8\% on the PASCAL VOC 2007 and by 7\% on the 
  		  MediaMill Challenge.
    		
  		}
}
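@comment{{The OpponentSIFT descriptor recommended above computes SIFT over the opponent colour
  space instead of intensity alone. A minimal NumPy sketch of the opponent transform as it is
  commonly defined, O1=(R-G)/sqrt(2), O2=(R+G-2B)/sqrt(6), O3=(R+G+B)/sqrt(3); the SIFT
  computation on each channel is omitted and the function name is illustrative:

    import numpy as np

    def rgb_to_opponent(image):
        # image: (H, W, 3) float array holding R, G, B channels.
        r, g, b = image[..., 0], image[..., 1], image[..., 2]
        o1 = (r - g) / np.sqrt(2.0)             # colour opponent channel
        o2 = (r + g - 2.0 * b) / np.sqrt(6.0)   # colour opponent channel
        o3 = (r + g + b) / np.sqrt(3.0)         # intensity channel
        return np.stack([o1, o2, o3], axis=-1)
}}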
@article{ByrneMMTA10,
  author = {Daragh Byrne and Aiden R. Doherty and Cees G. M. Snoek and Gareth J. F. Jones and Alan F. Smeaton},
  title = {Everyday Concept Detection in Visual Lifelogs: Validation, Relationships and Trends},
  journal = {Multimedia Tools and Applications},
  pages = {119--144},
  month = {August},
  year = {2010},
  volume = {49},
  number = {1},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/byrne-everyday-concept-detection-mmta.pdf},
  abstract = {
  
  		 The Microsoft SenseCam is a small lightweight wearable camera used to
		 passively capture photos and other sensor readings from a user’s day-to-day activities.
		 It captures on average 3,000 images in a typical day, equating to almost 1 million
		 images per year. It can be used to aid memory by creating a personal multimedia
		 lifelog, or visual recording of the wearer’s life. However, the sheer volume of image
		 data captured within a visual lifelog creates a number of challenges, particularly for
		 locating relevant content. Within this work, we explore the applicability of semantic
		 concept detection, a method often used within video retrieval, on the domain of
		 visual lifelogs. Our concept detector models the correspondence between low-level
		 visual features and high-level semantic concepts (such as indoors, outdoors, people,
		 buildings, etc.) using supervised machine learning. By doing so it determines the
		 probability of a concept’s presence. We apply detection of 27 everyday semantic
		 concepts on a lifelog collection composed of 257,518 SenseCam images from 5
		 users. The results were evaluated on a subset of 95,907 images, to determine the
		 accuracy for detection of each semantic concept. We conducted further analysis
		 on the temporal consistency, co-occurrence and relationships within the detected
		 concepts to more extensively investigate the robustness of the detectors within this
		 domain.
  
  		}
}
@article{SnoekCOM10,
  author = {Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Visual-Concept Search Solved?},
  journal = {{IEEE} Computer},
  pages = {76--78},
  month = {June},
  year = {2010},
  volume = {43},
  number = {6},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-smeulders-solved-computer.pdf},
  abstract = {
  		 
  		 Progress in visual-concept search suggests that machine understanding of images 
  		 is within reach.
    		
  		}
}
@article{RooijCGA10,
  author = {Ork de Rooij and Marcel Worring and Jack J. van Wijk},
  title = {MediaTable: Interactive Categorization of Multimedia Collections},
  journal = {{IEEE} Computer Graphics and Applications},
  pages = {42--51},
  month = {May},
  year = {2010},
  volume = {30},
  number = {5},
  pdf = {http://www.science.uva.nl/research/publications/2010/deRooijCGA2010},
}
@article{GemertCVIU10,
  author = {Jan C. van Gemert and Cees G. M. Snoek and Cor J. Veenman and Arnold W. M. Smeulders and Jan-Mark Geusebroek},
  title = {Comparing Compact Codebooks for Visual Categorization},
  journal = {Computer Vision and Image Understanding},
  pages = {450--462},
  month = {April},
  year = {2010},
  volume = {114},
  number = {4},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gemert-compact-codebooks-cviu.pdf},
  abstract = {
  		 
  		 In the face of current large-scale video libraries, the practical applicability of 
  		 content-based indexing algorithms is constrained by their efficiency. This paper 
  		 strives for efficient large-scale video indexing by comparing various visual-based 
  		 concept categorization techniques. In visual categorization, the popular codebook 
  		 model has shown excellent categorization performance. The codebook model represents
		 continuous visual features by discrete prototypes predefined in a vocabulary. The 
		 vocabulary size has a major impact on categorization efficiency, where a more compact 
		 vocabulary is more efficient. However, smaller vocabularies typically score lower on 
		 classification performance than larger vocabularies. This paper compares four approaches 
		 to achieve a compact codebook vocabulary while retaining categorization performance. 
		 For these four methods, we investigate the trade-off between codebook compactness
		 and categorization performance. We evaluate the methods on more than 200 h of challenging 
		 video data with as many as 101 semantic concepts. The results allow us to create a 
		 taxonomy of the four methods based on their efficiency and categorization performance.		  
    		
  		}
}
@article{RooijTMM10,
  author = {Ork de Rooij and Marcel Worring},
  title = {Browsing Video Along Multiple Threads},
  journal = {{IEEE} Transactions on Multimedia},
  pages = {121--130},
  month = {February},
  year = {2010},
  volume = {12},
  number = {2},
  pdf = {http://www.science.uva.nl/research/publications/2010/deRooijITM2010},
}
@article{LiTMM09,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
  title = {Learning Social Tag Relevance by Neighbor Voting},
  journal = {{IEEE} Transactions on Multimedia},
  pages = {1310--1322},
  month = {November},
  year = {2009},
  volume = {11},
  number = {7},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-socialtagrelevance-tmm.pdf},
  abstract = {
  
  		  Social image analysis and retrieval is important
		  for helping people organize and access the increasing amount
		  of user-tagged multimedia. Since user tagging is known to be
		  uncontrolled, ambiguous, and overly personalized, a fundamental
		  problem is how to interpret the relevance of a user-contributed
		  tag with respect to the visual content the tag is describing.
		  Intuitively, if different persons label visually similar images using
		  the same tags, these tags are likely to reflect objective aspects
		  of the visual content. Starting from this intuition, we propose
		  in this paper a neighbor voting algorithm which accurately and
		  efficiently learns tag relevance by accumulating votes from visual
		  neighbors. Under a set of well defined and realistic assumptions,
		  we prove that our algorithm is a good tag relevance measurement
		  for both image ranking and tag ranking. Three experiments on
		  3.5 million Flickr photos demonstrate the general applicability
		  of our algorithm in both social image retrieval and image tag
		  suggestion. Our tag relevance learning algorithm substantially
		  improves upon baselines for all the experiments. The results
		  suggest that the proposed algorithm is promising for real-world
		  applications.
    		
  		}
}
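@comment{{A plausible instantiation of the neighbour-voting rule above: score a tag for an image
  by how often it occurs among the image's k nearest visual neighbours, corrected for how common
  the tag is in the collection, so that frequent tags do not win by default. A hedged Python
  sketch with hypothetical inputs; the exact rule, distance measure, and normalisation are those
  of the paper, not of this code:

    def tag_relevance(tag, neighbour_tags, collection_tags, k):
        # neighbour_tags: tag sets of the k nearest visual neighbours.
        # collection_tags: tag sets of all images in the collection.
        votes = sum(1 for tags in neighbour_tags[:k] if tag in tags)
        prior = sum(1 for tags in collection_tags if tag in tags) / len(collection_tags)
        return votes - k * prior    # votes above what random neighbours would contribute
}}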
@article{SnoekFNTIR09,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {Concept-Based Video Retrieval},
  journal = {Foundations and Trends in Information Retrieval},
  pages = {215--322},
  year = {2009},
  volume = {4},
  number = {2},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-concept-based-video-retrieval-fntir.pdf},
  abstract = {
  		 
  		 In this paper, we review 300 references on video retrieval, indicating
		 when text-only solutions are unsatisfactory and showing the promising
		 alternatives, which are predominantly concept-based. Therefore, central
		 to our discussion is the notion of a semantic concept: an objective
		 linguistic description of an observable entity. Specifically, we present
		 our view on how its automated detection, selection under uncertainty,
		 and interactive usage might solve the major scientific problem for video
		 retrieval: the semantic gap. To bridge the gap, we lay down the anatomy
		 of a concept-based video search engine. We present a component-wise
		 decomposition of such an interdisciplinary multimedia system, covering
		 influences from information retrieval, computer vision, machine learning,
		 and human-computer interaction. For each of the components we
		 review state-of-the-art solutions in the literature, each having different
		 characteristics and merits. Because of these differences, we cannot
		 understand the progress in video retrieval without serious evaluation
  		 efforts such as carried out in the NIST TRECVID benchmark. We
		 discuss its data, tasks, results, and the many derived community
		 initiatives in creating annotations and baselines for repeatable experiments.
		 We conclude with our perspective on future challenges and
		 opportunities.
  		   		 		     		
  		}
}
@article{SmeatonIJIST08,
  author = {Alan F. Smeaton and Peter Wilkins and Marcel Worring and Ork de Rooij and Tat-Seng Chua and Huanbo Luan},
  title = {Content-based Video Retrieval: Three Example Systems from {TRECVid}},
  journal = {International Journal of Imaging Systems and Technology},
  year = {2008},
  volume = {18},
  number = {2--3},
  pages = {195--201},
  pdf = {},
}
@article{NguyenJVLC08,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Interactive Access to Large Image Collections using Similarity-based Visualization},
  journal = {Journal of Visual Languages and Computing},
  month = {April},
  year = {2008},
  volume = {19},
  number = {2},
  pages = {203--224},
  pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-similarity-visualization-jvlc.pdf},
  abstract = {
                 
                 Image collections are getting larger and larger. To access those 
                 collections, systems for managing, searching, and browsing are 
                 necessary. Visualization plays an essential role in such systems. 
                 Existing visualization systems do not analyze all the problems 
                 occurring when dealing with large visual collections. In this 
                 paper, we make these problems explicit. From there, we establish 
                 three general requirements: overview, visibility, and structure 
                 preservation. Solutions for each requirement are proposed, as well 
                 as functions balancing the different requirements. We present an 
                 optimal visualization scheme, supporting users in interacting with 
                 large image collections. Experimental results with a collection of 
                 10,000 Corel images, using simulated user actions, show that the 
                 proposed scheme significantly improves performance for a given 
                 task compared to the 2D grid-based visualizations commonly used in 
                 content-based image retrieval.
                                   		 
		}
}
@article{SnoekMM08,
  author = {Cees G. M. Snoek and Marcel Worring and Ork de Rooij and Koen E. A. {van de Sande} and Rong Yan and Alexander G. Hauptmann},
  title = {{VideOlympics}: Real-Time Evaluation of Multimedia Retrieval Systems},
  journal = {{IEEE} MultiMedia},
  pages = {86--91},
  month = {January--March},
  year = {2008},
  volume = {15},
  number = {1},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-videolympics-mm.pdf},
  abstract = {
  		  
  		 Video search is an experience for the senses. As a result, traditional 
  		 information retrieval metrics can't fully measure the quality of a video 
  		 search system. To provide a more interactive assessment of today's video 
  		 search engines, the authors have organized the VideOlympics as a real-time 
  		 evaluation showcase where systems compete to answer specific video searches 
  		 in front of a live audience. At VideOlympics, seeing and hearing is believing.
  		}
}
@article{NguyenTOMCCAP08,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Optimization of Interactive Visual-Similarity-Based Search},
  journal = {{ACM} Transactions on Multimedia Computing, Communications and Applications},
  month = {January},
  year = {2008},
  volume = {4},
  number = {1},
  pages = {7:1--23},
  pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-optimization-tomccap.pdf},
  abstract = {
  		 
		 At one end of the spectrum, research in interactive content-based 
		 retrieval concentrates on machine learning methods for effective 
		 use of relevance feedback. On the other end, the information 
		 visualization community focuses on effective methods for conveying 
		 information to the user. What is lacking is research considering 
		 the information visualization and interactive retrieval as truly 
		 integrated parts of one content-based search system. In such an 
		 integrated system, there are many degrees of freedom like the 
		 similarity function, the number of images to display, the image 
		 size, different visualization modes, and possible feedback modes. 
		 To base the optimal values for all of those on user studies is 
		 unfeasible. We therefore develop search scenarios in which tasks 
		 and user actions are simulated. From there, the proposed scheme is 
		 optimized based on objective constraints and evaluation criteria. 
		 In such a manner, the degrees of freedom are reduced and the 
		 remaining degrees can be evaluated in user studies. In this article, 
		 we present a system that integrates advanced similarity based 
		 visualization with active learning. We have performed extensive 
		 experimentation on interactive category search with different 
		 image collections. The results using the proposed simulation 
		 scheme show that indeed the use of advanced visualization and 
		 active learning pays off in all of these datasets.
		 
		}
}
@article{NguyenTMM07,
  author = {Giang P. Nguyen and Marcel Worring and Arnold W. M. Smeulders},
  title = {Interactive Search by Direct Manipulation of Dissimilarity Space},
  journal = {{IEEE} Transactions on Multimedia},
  month = {November},
  year = {2007},
  volume = {9},
  number = {7},
  pages = {1404--1415},
  pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-dissimilarity-tmm.pdf},
  abstract = {
  
		 In this paper, we argue to learn dissimilarity for interactive search in 
		 content based image retrieval. In literature, dissimilarity is often learned 
		 via the feature space by feature selection, feature weighting or by adjusting 
		 the parameters of a function of the features. Other than existing techniques, 
		 we use feedback to adjust the dissimilarity space independent of feature space. 
		 This has the great advantage that it manipulates dissimilarity directly. To 
		 create a dissimilarity space, we use the method proposed by Pekalska and Duin, 
		 selecting a set of images called prototypes and computing distances to those 
		 prototypes for all images in the collection. After the user gives feedback, 
		 we apply active learning with a one-class support vector machine to decide the 
		 movement of images such that relevant images stay close together while irrelevant 
		 ones are pushed away (the work of Guo). The dissimilarity space is then adjusted 
		 accordingly. Results on a Corel dataset of 10000 images and a TRECVID collection 
		 of 43907 keyframes show that our proposed approach is not only intuitive, it 
		 also significantly improves the retrieval performance. 
		  		 
		}
}
@article{SeinstraMM07,
  author = {Frank J. Seinstra and Jan-Mark Geusebroek and Dennis Koelma and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title = {High-Performance Distributed Image and Video Content Analysis with Parallel-Horus},
  journal = {{IEEE} MultiMedia},
  pages = {64--75},
  month = {October--December},
  year = {2007},
  volume = {14},
  number = {4},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/seinstra-parallel-horus-mm.pdf},
  abstract = {
  		  
  		 As the world uses more digital video that requires greater storage space, 
  		 Grid computing is becoming indispensable for urgent problems in multimedia 
  		 content analysis. Parallel-Horus, a support tool for applications in multimedia 
  		 Grid computing, lets users implement multimedia applications as sequential 
  		 programs for efficient execution on clusters and Grids, based on wide-area 
  		 multimedia services.  
    		
  		}
}
@article{SnoekTMM07b,
  author = {Cees G. M. Snoek and Bouke Huurnink and Laura Hollink and Maarten de Rijke and Guus Schreiber and Marcel Worring},
  title = {Adding Semantics to Detectors for Video Retrieval},
  journal = {{IEEE} Transactions on Multimedia},
  month = {August},
  year = {2007},
  volume = {9},
  number = {5},
  pages = {975--986},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-semantics2detectors-tmm.pdf},
  abstract = {
  		 
  		 In this paper, we propose an automatic video retrieval method based on high-level 
  		 concept detectors. Research in video analysis has reached the point where over 100 
  		 concept detectors can be learned in a generic fashion, albeit with mixed performance. 
  		 Such a set of detectors is very small still compared to ontologies aiming to capture 
  		 the full vocabulary a user has. We aim to throw a bridge between the two fields by 
  		 building a multimedia thesaurus, i.e., a set of machine learned concept detectors 
  		 that is enriched with semantic descriptions and semantic structure obtained from 
  		 WordNet. Given a multimodal user query, we identify three strategies to select a 
  		 relevant detector from this thesaurus, namely: text matching, ontology querying, 
  		 and semantic visual querying. We evaluate the methods against the automatic search 
  		 task of the TRECVID 2005 video retrieval benchmark, using a news video archive of 
  		 85 h in combination with a thesaurus of 363 machine learned concept detectors. We 
  		 assess the influence of thesaurus size on video search performance, evaluate and 
  		 compare the multimodal selection strategies for concept detectors, and finally 
  		 discuss their combined potential using oracle fusion. The set of queries in the 
  		 TRECVID 2005 corpus is too small for us to be definite in our conclusions, but the 
  		 results suggest promising new lines of research.
  		 
		}
}
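@comment{{One of the three detector-selection strategies above, text matching, can be pictured as
  ranking the detectors in the multimedia thesaurus by word overlap between the user query and
  each detector's textual description. A minimal hedged Python sketch with hypothetical data
  structures; the paper's actual matching, and the ontology and semantic visual strategies, are
  not reproduced here:

    def select_detector(query, thesaurus):
        # thesaurus: mapping from detector name to a WordNet-derived description.
        query_words = set(query.lower().split())
        def overlap(description):
            return len(query_words & set(description.lower().split()))
        return max(thesaurus, key=lambda name: overlap(thesaurus[name]))
}}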
@article{WorringTMM07,
  author = {Marcel Worring and Guus Schreiber},
  title = {Semantic Image and Video Indexing in Broad Domains},
  journal = {{IEEE} Transactions on Multimedia},
  month = {August},
  year = {2007},
  volume = {9},
  number = {5},
  pages = {909--911},
  pdf = {http://www.science.uva.nl/research/mediamill/pub/worring-special-issue-tmm.pdf},
  abstract = {
  		 
  		 The six papers in this special section focus on semantic image and 
  		 video indexing in broad domains. To bring semantics to the user in 
  		 broad domains both the indexing and retrieval step have to be considered. 
  		 The papers here address both steps and the relation to ontologies.
  		 
		}
}
@article{SnoekTMM07,
  author = {Cees G. M. Snoek and Marcel Worring and Dennis C. Koelma and Arnold W. M. Smeulders},
  title = {A Learned Lexicon-Driven Paradigm for Interactive Video Retrieval},
  journal = {{IEEE} Transactions on Multimedia},
  month = {February},
  year = {2007},
  volume = {9},
  number = {2},
  pages = {280--292},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-lexicon-tmm.pdf},
  abstract = {
		
		 Effective video retrieval is the result of an interplay between
		 interactive query selection, advanced visualization of results, and
		 a goal-oriented human user. Traditional interactive video retrieval
		 approaches emphasize paradigms, such as query-by-keyword and
		 query-by-example, to aid the user in the search for relevant
		 footage. However, recent results in automatic indexing indicate that
		 query-by-concept is becoming a viable resource for interactive
		 retrieval also. We propose in this paper a new video retrieval
		 paradigm. The core of the paradigm is formed by first detecting a
		 large lexicon of semantic concepts. From there, we combine
		 query-by-concept, query-by-example, query-by-keyword, and user
		 interaction into the \emph{MediaMill} semantic video search engine.
		 To measure the impact of increasing lexicon size on interactive
		 video retrieval performance, we performed two experiments against
		 the 2004 and 2005 NIST TRECVID benchmarks, using lexicons containing
		 32 and 101 concepts respectively. The results suggest that from all
		 factors that play a role in interactive retrieval, a large lexicon
		 of semantic concepts matters most. Indeed, by exploiting large
		 lexicons, many video search questions are solvable without using
		 query-by-keyword and query-by-example. What is more, we show that
		 the lexicon-driven search engine outperforms all state-of-the-art
		 video retrieval systems in both TRECVID 2004 and 2005.
  		 
		}
}
@article{SnoekPAMI06,
  author = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra and Arnold W. M. Smeulders},
  title = {The Semantic Pathfinder: Using an Authoring Metaphor for Generic Multimedia Indexing},
  journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  month = {October},
  year = {2006},
  volume = {28},
  number = {10},
  pages = {1678--1689},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-pathfinder-pami.pdf},
  abstract = {
  
  		 This paper presents the semantic pathfinder architecture for
  		 generic indexing of multimedia archives. The semantic pathfinder
  		 extracts semantic concepts from video by exploring different paths
  		 through three consecutive analysis steps, which we derive from the
  		 observation that produced video is the result of an
  		 authoring-driven process. We exploit this \emph{authoring
  		 metaphor} for machine-driven understanding. The pathfinder starts
  		 with the content analysis step. In this analysis step, we follow a
  		 data-driven approach of indexing semantics. The style analysis
  		 step is the second analysis step. Here we tackle the indexing
  		 problem by viewing a video from the perspective of production.
  		 Finally, in the context analysis step, we view semantics in
  		 context. The virtue of the semantic pathfinder is its ability to
  		 learn the best path of analysis steps on a per-concept basis. To
  		 show the generality of this novel indexing approach we develop
      		 detectors for a lexicon of 32 concepts and we evaluate the
  		 semantic pathfinder against the 2004 NIST TRECVID video retrieval
  	  	 benchmark, using a news archive of 64 hours. Top ranking
  		 performance in the semantic concept detection task indicates the
  		 merit of the semantic pathfinder for generic indexing of
		 multimedia archives.
  
  		}
}
@article{SnoekTOMCCAP06,
  author = {Cees G. M. Snoek and Marcel Worring and Alexander G. Hauptmann},
  title = {Learning Rich Semantics from News Video Archives by Style Analysis},
  journal = {{ACM} Transactions on Multimedia Computing, Communications and Applications},
  month = {May},
  year = {2006},
  volume = {2},
  number = {2},
  pages = {91--108},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-style-tomccap.pdf},
  abstract = {
  
  		 We propose a generic and robust framework for news video indexing, which 
  		 is founded on a broadcast news production model. We identify within this 
  		 model four production phases, each providing useful metadata for annotation. 
  		 In contrast to semi-automatic indexing approaches, which exploit this 
  		 information at production time, we adhere to an automatic data-driven 
  		 approach. To that end, we analyze a digital news video using a separate 
  		 set of multimodal detectors for each production phase. By combining the 
  		 resulting production-derived features into a statistical classifier 
  		 ensemble, the framework facilitates robust classification of several rich 
  		 semantic concepts in news video; rich meaning that concepts share many 
  		 similarities in their production process. Experiments on an archive of 
  		 120 hours of news video, from the 2003 TRECVID benchmark, show that a 
  		 combined analysis of production phases yields the best results. In addition, 
  		 we demonstrate that the accuracy of the proposed style analysis framework 
  		 for classification of several rich semantic concepts is state-of-the-art.
  		 
		}
}
@article{HollinkVISP05,
  author = {Laura Hollink and Giang Nguyen and Dennis C. Koelma and Guus Schreiber and Marcel Worring},
  title = {Assessing User Behaviour in News Video Retrieval},
  journal = {{IEE} Proceedings - Vision, Image and Signal Processing},
  month = {December},
  year = {2005},
  volume = {152},
  number = {6},
  pages = {911--918},
  pdf = {http://staff.science.uva.nl/~giangnp/pubs/pdf/2005/Hollink05.pdf},
  abstract = {
  		
  		 The results of a study are presented, in which people queried a news
		 archive using an interactive video retrieval system. 242 search sessions
		 by 39 participants on 24 topics were assessed. Before, during and after
		 the study, participants filled in questionnaires about their expectations
		 of a search. The questionnaire data, logged user actions on the system,
		 queries formulated by users, and a quality measure of each search were
		 studied. The results of the study show that topics concerning 'specific'
		 people or objects were better retrieved than topics concerning 'general'
		 objects and scenes. Users were able to estimate the overall quality of a
		 search but did not know when the optimal result was reached within the
		 search process. Analysis of the results at various stages in the retrieval
		 process suggests that retrieval based on transcriptions of the speech in
		 video data adds more to the average precision of the result than
		 content-based image retrieval based on low-level visual features. The
		 latter is particularly useful in providing the user with an overview of
		 the dataset and thus an indication of the success of a search. Based on
		 the results, implications for the design of user interfaces of video 
		 retrieval systems are discussed.
  		
  		}
}
@article{NguyenMS05,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Relevance feedback based saliency adaptation in {CBIR}},
  journal = {Multimedia Systems},
  month = {October},
  year = {2005},
  volume = {10},
  number = {6},
  pages = {499--512},
  pdf = {http://staff.science.uva.nl/~giangnp/pubs/pdf/2005/giangACM_MS05.pdf},
  abstract = {
  		
  		 Content-based image retrieval ({CBIR}) has been under investigation
		 for a long time with many systems built to meet different
		 application demands. However, in all systems, there is still a gap
		 between the user's expectation and the system's retrieval
		 capabilities. Therefore, user interaction is an essential
		 component of any {CBIR} system. Interaction up to now has mostly
		 focused on changing global image features or similarities between
		 images. We consider the interaction with salient details in the
		 image i.e. points, lines, and regions. Interactive salient detail
		 definition goes further than summarizing the image into a set of
		 salient details. We aim to dynamically update the user- and
		 context-dependent definition of saliency based on relevance
		 feedback. To that end, we propose an interaction framework for
		 salient details from the perspective of the user. A number of
		 instantiations of the framework are presented. Finally, we apply
		 our approach for query refinement in detail based image retrieval
		 system with salient points and regions. Experimental results prove
		 the effectiveness of adapting the saliency from user feedback in
		 the retrieval process.
  
  		}
}
@article{SnoekTMM05,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {Multimedia Event-Based Video Indexing using Time Intervals},
  journal = {{IEEE} Transactions on Multimedia},
  month = {August},
  year = {2005},
  volume = {7},
  number = {4},
  pages = {638--647},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-time-mm.pdf},
  abstract = {
  		 
  		 We propose the Time Interval Multimedia Event (TIME) framework as a robust 
  		 approach for classification of semantic events in multimodal video documents. 
  		 The representation used in TIME extends the Allen time relations and allows 
  		 for proper inclusion of context and synchronization of the heterogeneous 
  		 information sources involved in multimodal video analysis. To demonstrate the 
  		 viability of our approach, it was evaluated on the domains of soccer and news 
  		 broadcasts. For automatic classification of semantic events, we compare three 
  		 different machine learning techniques, namely the C4.5 decision tree, Maximum 
  		 Entropy, and Support Vector Machine. The results show that semantic video 
  		 indexing results significantly benefit from using the TIME framework.
  		 
		}
}
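@comment{{The TIME framework above builds on the Allen time relations to synchronise the
  heterogeneous information sources in multimodal video analysis. A short Python sketch of a few
  of the classic Allen relations between two (start, end) intervals; the TIME-specific extensions
  described in the paper are not reproduced here:

    def allen_relation(a, b):
        # a, b: (start, end) tuples with start < end.
        if a[1] < b[0]:
            return "precedes"
        if a[1] == b[0]:
            return "meets"
        if a[0] < b[0] < a[1] < b[1]:
            return "overlaps"
        if b[0] < a[0] and a[1] < b[1]:
            return "during"
        if a == b:
            return "equals"
        return "other"
}}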
@article{SnoekMTAP05,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {Multimodal Video Indexing: A Review of the State-of-the-art},
  journal = {Multimedia Tools and Applications},
  month = {January},
  year = {2005},
  volume = {25},
  number = {1},
  pages = {5--35},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-review-mmta.pdf},
  abstract = {
  		 
  		 Efficient and effective handling of video documents depends on the availability 
  		 of indexes. Manual indexing is unfeasible for large video collections. In this 
  		 paper we survey several methods aiming at automating this time and resource 
  		 consuming process. Good reviews on single modality based video indexing have 
  		 appeared in literature. Effective indexing, however, requires a multimodal 
  		 approach in which either the most appropriate modality is selected or the 
  		 different modalities are used in collaborative fashion. Therefore, instead of 
  		 separately treating the different information sources involved, and their 
  		 specific algorithms, we focus on the similarities and differences between the 
  		 modalities. To that end we put forward a unifying and multimodal framework, 
  		 which views a video document from the perspective of its author. This framework 
  		 forms the guiding principle for identifying index types, for which automatic 
  		 methods are found in literature. It furthermore forms the basis for 
  		 categorizing these different methods.
  		 
		}
}